2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21 * Author: Fenghua Yu <fenghua.yu@intel.com>
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
42 #define ROOT_SIZE VTD_PAGE_SIZE
43 #define CONTEXT_SIZE VTD_PAGE_SIZE
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48 #define IOAPIC_RANGE_START (0xfee00000)
49 #define IOAPIC_RANGE_END (0xfeefffff)
50 #define IOVA_START_ADDR (0x1000)
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56 #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN IOVA_PFN(DMA_64BIT_MASK)
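/*
 * Worked example for the address macros above (illustrative only,
 * assuming 4KB pages, i.e. PAGE_SHIFT == 12):
 *   DOMAIN_MAX_ADDR(48)  = (1ULL << 48) - 1 = 0xffffffffffff
 *   IOVA_PFN(0xfee00000) = 0xfee00000 >> 12 = 0xfee00
 * so the IOAPIC range reserved in dmar_init_reserved_ranges() covers
 * PFNs 0xfee00 through 0xfeeff of the reserved iova space.
 */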
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
66 * 12-63: Context Ptr (12 - (haw-1))
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 static inline bool root_present(struct root_entry *root)
76 return (root->val & 1);
78 static inline void set_root_present(struct root_entry *root)
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 root->val |= value & VTD_PAGE_MASK;
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
90 return (struct context_entry *)
91 (root_present(root)?phys_to_virt(
92 root->val & VTD_PAGE_MASK) :
99 * 1: fault processing disable
100 * 2-3: translation type
101 * 12-63: address space root
107 struct context_entry {
112 static inline bool context_present(struct context_entry *context)
114 return (context->lo & 1);
116 static inline void context_set_present(struct context_entry *context)
121 static inline void context_set_fault_enable(struct context_entry *context)
123 context->lo &= (((u64)-1) << 2) | 1;
126 #define CONTEXT_TT_MULTI_LEVEL 0
128 static inline void context_set_translation_type(struct context_entry *context,
131 context->lo &= (((u64)-1) << 4) | 3;
132 context->lo |= (value & 3) << 2;
135 static inline void context_set_address_root(struct context_entry *context,
138 context->lo |= value & VTD_PAGE_MASK;
141 static inline void context_set_address_width(struct context_entry *context,
144 context->hi |= value & 7;
147 static inline void context_set_domain_id(struct context_entry *context,
150 context->hi |= (value & ((1 << 16) - 1)) << 8;
153 static inline void context_clear_entry(struct context_entry *context)
165 * 12-63: Host physical address
171 static inline void dma_clear_pte(struct dma_pte *pte)
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 pte->val |= DMA_PTE_READ;
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 pte->val |= DMA_PTE_WRITE;
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 pte->val = (pte->val & ~3) | (prot & 3);
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 return (pte->val & VTD_PAGE_MASK);
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 pte->val |= (addr & VTD_PAGE_MASK);
201 static inline bool dma_pte_present(struct dma_pte *pte)
203 return (pte->val & 3) != 0;
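/*
 * Minimal usage sketch for the dma_pte helpers above (illustrative only;
 * domain_page_mapping() below does the real work on live page-table
 * entries):
 *
 *	struct dma_pte pte = { 0 };
 *
 *	dma_set_pte_addr(&pte, page_to_phys(pg));
 *	dma_set_pte_prot(&pte, DMA_PTE_READ | DMA_PTE_WRITE);
 *
 * dma_pte_present(&pte) is now true because the low two permission bits
 * are non-zero, and dma_pte_addr(&pte) returns the page-aligned address.
 */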
206 /* devices under the same p2p bridge are owned in one domain */
207 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209 /* domain represents a virtual machine; more than one device
210 * across iommus may be owned by one domain, e.g. a kvm guest.
212 #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1)
215 int id; /* domain id */
216 unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/
218 struct list_head devices; /* all devices' list */
219 struct iova_domain iovad; /* iova's that belong to this domain */
221 struct dma_pte *pgd; /* virtual address */
222 spinlock_t mapping_lock; /* page table lock */
223 int gaw; /* max guest address width */
225 /* adjusted guest address width, 0 is level 2 30-bit */
228 int flags; /* flags to find out type of domain */
230 int iommu_coherency;/* indicate coherency of iommu access */
231 int iommu_count; /* reference count of iommu */
232 spinlock_t iommu_lock; /* protect iommu set in domain */
233 u64 max_addr; /* maximum mapped address */
236 /* PCI domain-device relationship */
237 struct device_domain_info {
238 struct list_head link; /* link to domain siblings */
239 struct list_head global; /* link to global list */
240 u8 bus; /* PCI bus number */
241 u8 devfn; /* PCI devfn number */
242 struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
243 struct dmar_domain *domain; /* pointer to domain */
246 static void flush_unmaps_timeout(unsigned long data);
248 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
250 #define HIGH_WATER_MARK 250
251 struct deferred_flush_tables {
253 struct iova *iova[HIGH_WATER_MARK];
254 struct dmar_domain *domain[HIGH_WATER_MARK];
257 static struct deferred_flush_tables *deferred_flush;
259 /* bitmap for indexing intel_iommus */
260 static int g_num_of_iommus;
262 static DEFINE_SPINLOCK(async_umap_flush_lock);
263 static LIST_HEAD(unmaps_to_do);
266 static long list_size;
268 static void domain_remove_dev_info(struct dmar_domain *domain);
271 static int __initdata dmar_map_gfx = 1;
272 static int dmar_forcedac;
273 static int intel_iommu_strict;
275 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
276 static DEFINE_SPINLOCK(device_domain_lock);
277 static LIST_HEAD(device_domain_list);
279 static int __init intel_iommu_setup(char *str)
284 if (!strncmp(str, "off", 3)) {
286 printk(KERN_INFO"Intel-IOMMU: disabled\n");
287 } else if (!strncmp(str, "igfx_off", 8)) {
290 "Intel-IOMMU: disable GFX device mapping\n");
291 } else if (!strncmp(str, "forcedac", 8)) {
293 "Intel-IOMMU: Forcing DAC for PCI devices\n");
295 } else if (!strncmp(str, "strict", 6)) {
297 "Intel-IOMMU: disable batched IOTLB flush\n");
298 intel_iommu_strict = 1;
301 str += strcspn(str, ",");
307 __setup("intel_iommu=", intel_iommu_setup);
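/*
 * Example kernel command line usage of the options parsed above
 * (illustrative; options may be combined, comma separated):
 *
 *	intel_iommu=off               disable DMA remapping entirely
 *	intel_iommu=igfx_off          do not remap the graphics device
 *	intel_iommu=forcedac,strict   force DAC for PCI devices and use
 *	                              synchronous (unbatched) IOTLB flushes
 */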
309 static struct kmem_cache *iommu_domain_cache;
310 static struct kmem_cache *iommu_devinfo_cache;
311 static struct kmem_cache *iommu_iova_cache;
313 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
318 /* trying to avoid low memory issues */
319 flags = current->flags & PF_MEMALLOC;
320 current->flags |= PF_MEMALLOC;
321 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
322 current->flags &= (~PF_MEMALLOC | flags);
327 static inline void *alloc_pgtable_page(void)
332 /* trying to avoid low memory issues */
333 flags = current->flags & PF_MEMALLOC;
334 current->flags |= PF_MEMALLOC;
335 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
336 current->flags &= (~PF_MEMALLOC | flags);
340 static inline void free_pgtable_page(void *vaddr)
342 free_page((unsigned long)vaddr);
345 static inline void *alloc_domain_mem(void)
347 return iommu_kmem_cache_alloc(iommu_domain_cache);
350 static void free_domain_mem(void *vaddr)
352 kmem_cache_free(iommu_domain_cache, vaddr);
355 static inline void * alloc_devinfo_mem(void)
357 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
360 static inline void free_devinfo_mem(void *vaddr)
362 kmem_cache_free(iommu_devinfo_cache, vaddr);
365 struct iova *alloc_iova_mem(void)
367 return iommu_kmem_cache_alloc(iommu_iova_cache);
370 void free_iova_mem(struct iova *iova)
372 kmem_cache_free(iommu_iova_cache, iova);
376 static inline int width_to_agaw(int width);
378 /* calculate agaw for each iommu.
379 * "SAGAW" may be different across iommus; use a default agaw, and
380 * fall back to a smaller supported agaw for iommus that don't support the default agaw.
382 int iommu_calculate_agaw(struct intel_iommu *iommu)
387 sagaw = cap_sagaw(iommu->cap);
388 for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
390 if (test_bit(agaw, &sagaw))
397 /* in native case, each domain is related to only one iommu */
398 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
402 BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
404 iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
405 if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
408 return g_iommus[iommu_id];
411 /* "Coherency" capability may be different across iommus */
412 static void domain_update_iommu_coherency(struct dmar_domain *domain)
416 domain->iommu_coherency = 1;
418 i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
419 for (; i < g_num_of_iommus; ) {
420 if (!ecap_coherent(g_iommus[i]->ecap)) {
421 domain->iommu_coherency = 0;
424 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
428 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
430 struct dmar_drhd_unit *drhd = NULL;
433 for_each_drhd_unit(drhd) {
437 for (i = 0; i < drhd->devices_cnt; i++)
438 if (drhd->devices[i]->bus->number == bus &&
439 drhd->devices[i]->devfn == devfn)
442 if (drhd->include_all)
449 static void domain_flush_cache(struct dmar_domain *domain,
450 void *addr, int size)
452 if (!domain->iommu_coherency)
453 clflush_cache_range(addr, size);
456 /* Gets context entry for a given bus and devfn */
457 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
460 struct root_entry *root;
461 struct context_entry *context;
462 unsigned long phy_addr;
465 spin_lock_irqsave(&iommu->lock, flags);
466 root = &iommu->root_entry[bus];
467 context = get_context_addr_from_root(root);
469 context = (struct context_entry *)alloc_pgtable_page();
471 spin_unlock_irqrestore(&iommu->lock, flags);
474 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
475 phy_addr = virt_to_phys((void *)context);
476 set_root_value(root, phy_addr);
477 set_root_present(root);
478 __iommu_flush_cache(iommu, root, sizeof(*root));
480 spin_unlock_irqrestore(&iommu->lock, flags);
481 return &context[devfn];
484 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
486 struct root_entry *root;
487 struct context_entry *context;
491 spin_lock_irqsave(&iommu->lock, flags);
492 root = &iommu->root_entry[bus];
493 context = get_context_addr_from_root(root);
498 ret = context_present(&context[devfn]);
500 spin_unlock_irqrestore(&iommu->lock, flags);
504 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
506 struct root_entry *root;
507 struct context_entry *context;
510 spin_lock_irqsave(&iommu->lock, flags);
511 root = &iommu->root_entry[bus];
512 context = get_context_addr_from_root(root);
514 context_clear_entry(&context[devfn]);
515 __iommu_flush_cache(iommu, &context[devfn], \
518 spin_unlock_irqrestore(&iommu->lock, flags);
521 static void free_context_table(struct intel_iommu *iommu)
523 struct root_entry *root;
526 struct context_entry *context;
528 spin_lock_irqsave(&iommu->lock, flags);
529 if (!iommu->root_entry) {
532 for (i = 0; i < ROOT_ENTRY_NR; i++) {
533 root = &iommu->root_entry[i];
534 context = get_context_addr_from_root(root);
536 free_pgtable_page(context);
538 free_pgtable_page(iommu->root_entry);
539 iommu->root_entry = NULL;
541 spin_unlock_irqrestore(&iommu->lock, flags);
544 /* page table handling */
545 #define LEVEL_STRIDE (9)
546 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
548 static inline int agaw_to_level(int agaw)
553 static inline int agaw_to_width(int agaw)
555 return 30 + agaw * LEVEL_STRIDE;
559 static inline int width_to_agaw(int width)
561 return (width - 30) / LEVEL_STRIDE;
564 static inline unsigned int level_to_offset_bits(int level)
566 return (12 + (level - 1) * LEVEL_STRIDE);
569 static inline int address_level_offset(u64 addr, int level)
571 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
574 static inline u64 level_mask(int level)
576 return ((u64)-1 << level_to_offset_bits(level));
579 static inline u64 level_size(int level)
581 return ((u64)1 << level_to_offset_bits(level));
584 static inline u64 align_to_level(u64 addr, int level)
586 return ((addr + level_size(level) - 1) & level_mask(level));
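/*
 * Worked example of the width/agaw/level arithmetic above for the default
 * 48-bit domain address width (illustrative only):
 *
 *	width_to_agaw(48)       = (48 - 30) / 9 = 2
 *	agaw_to_width(2)        = 30 + 2 * 9    = 48
 *	level_to_offset_bits(1) = 12, level 2 -> 21, level 3 -> 30, level 4 -> 39
 *
 * With a 12-bit page offset and LEVEL_STRIDE == 9 bits per level, a 48-bit
 * width therefore needs (48 - 12) / 9 = 4 page-table levels, and
 * address_level_offset(addr, 1) picks bits 12-20 of addr, i.e. the index
 * into a 512-entry last-level table.
 */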
589 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
591 int addr_width = agaw_to_width(domain->agaw);
592 struct dma_pte *parent, *pte = NULL;
593 int level = agaw_to_level(domain->agaw);
597 BUG_ON(!domain->pgd);
599 addr &= (((u64)1) << addr_width) - 1;
600 parent = domain->pgd;
602 spin_lock_irqsave(&domain->mapping_lock, flags);
606 offset = address_level_offset(addr, level);
607 pte = &parent[offset];
611 if (!dma_pte_present(pte)) {
612 tmp_page = alloc_pgtable_page();
615 spin_unlock_irqrestore(&domain->mapping_lock,
619 domain_flush_cache(domain, tmp_page, PAGE_SIZE);
620 dma_set_pte_addr(pte, virt_to_phys(tmp_page));
622 * higher-level tables always set r/w; the last-level page
623 * table controls read/write
625 dma_set_pte_readable(pte);
626 dma_set_pte_writable(pte);
627 domain_flush_cache(domain, pte, sizeof(*pte));
629 parent = phys_to_virt(dma_pte_addr(pte));
633 spin_unlock_irqrestore(&domain->mapping_lock, flags);
637 /* return address's pte at specific level */
638 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
641 struct dma_pte *parent, *pte = NULL;
642 int total = agaw_to_level(domain->agaw);
645 parent = domain->pgd;
646 while (level <= total) {
647 offset = address_level_offset(addr, total);
648 pte = &parent[offset];
652 if (!dma_pte_present(pte))
654 parent = phys_to_virt(dma_pte_addr(pte));
660 /* clear one page's page table */
661 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
663 struct dma_pte *pte = NULL;
665 /* get last level pte */
666 pte = dma_addr_level_pte(domain, addr, 1);
670 domain_flush_cache(domain, pte, sizeof(*pte));
674 /* clear last level pte, a tlb flush should be followed */
675 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
677 int addr_width = agaw_to_width(domain->agaw);
679 start &= (((u64)1) << addr_width) - 1;
680 end &= (((u64)1) << addr_width) - 1;
681 /* in case it's partial page */
682 start = PAGE_ALIGN(start);
685 /* we don't need lock here, nobody else touches the iova range */
686 while (start < end) {
687 dma_pte_clear_one(domain, start);
688 start += VTD_PAGE_SIZE;
692 /* free page table pages. last level pte should already be cleared */
693 static void dma_pte_free_pagetable(struct dmar_domain *domain,
696 int addr_width = agaw_to_width(domain->agaw);
698 int total = agaw_to_level(domain->agaw);
702 start &= (((u64)1) << addr_width) - 1;
703 end &= (((u64)1) << addr_width) - 1;
705 /* we don't need lock here, nobody else touches the iova range */
707 while (level <= total) {
708 tmp = align_to_level(start, level);
709 if (tmp >= end || (tmp + level_size(level) > end))
713 pte = dma_addr_level_pte(domain, tmp, level);
716 phys_to_virt(dma_pte_addr(pte)));
718 domain_flush_cache(domain, pte, sizeof(*pte));
720 tmp += level_size(level);
725 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
726 free_pgtable_page(domain->pgd);
732 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
734 struct root_entry *root;
737 root = (struct root_entry *)alloc_pgtable_page();
741 __iommu_flush_cache(iommu, root, ROOT_SIZE);
743 spin_lock_irqsave(&iommu->lock, flags);
744 iommu->root_entry = root;
745 spin_unlock_irqrestore(&iommu->lock, flags);
750 static void iommu_set_root_entry(struct intel_iommu *iommu)
756 addr = iommu->root_entry;
758 spin_lock_irqsave(&iommu->register_lock, flag);
759 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
761 cmd = iommu->gcmd | DMA_GCMD_SRTP;
762 writel(cmd, iommu->reg + DMAR_GCMD_REG);
764 /* Make sure hardware completes it */
765 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
766 readl, (sts & DMA_GSTS_RTPS), sts);
768 spin_unlock_irqrestore(&iommu->register_lock, flag);
771 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
776 if (!cap_rwbf(iommu->cap))
778 val = iommu->gcmd | DMA_GCMD_WBF;
780 spin_lock_irqsave(&iommu->register_lock, flag);
781 writel(val, iommu->reg + DMAR_GCMD_REG);
783 /* Make sure hardware completes it */
784 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
785 readl, (!(val & DMA_GSTS_WBFS)), val);
787 spin_unlock_irqrestore(&iommu->register_lock, flag);
790 /* return value determines if we need a write buffer flush */
791 static int __iommu_flush_context(struct intel_iommu *iommu,
792 u16 did, u16 source_id, u8 function_mask, u64 type,
793 int non_present_entry_flush)
799 * In the non-present entry flush case, if hardware doesn't cache
800 * non-present entries we do nothing; if hardware does cache non-present
801 * entries, we flush entries of domain 0 (the domain id is used to cache
802 * any non-present entries)
804 if (non_present_entry_flush) {
805 if (!cap_caching_mode(iommu->cap))
812 case DMA_CCMD_GLOBAL_INVL:
813 val = DMA_CCMD_GLOBAL_INVL;
815 case DMA_CCMD_DOMAIN_INVL:
816 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
818 case DMA_CCMD_DEVICE_INVL:
819 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
820 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
827 spin_lock_irqsave(&iommu->register_lock, flag);
828 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
830 /* Make sure hardware completes it */
831 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
832 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
834 spin_unlock_irqrestore(&iommu->register_lock, flag);
836 /* flush context entry will implicitly flush write buffer */
840 /* return value determines if we need a write buffer flush */
841 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
842 u64 addr, unsigned int size_order, u64 type,
843 int non_present_entry_flush)
845 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
846 u64 val = 0, val_iva = 0;
850 * In the non-present entry flush case, if hardware doesn't cache
851 * non-present entries we do nothing; if hardware does cache non-present
852 * entries, we flush entries of domain 0 (the domain id is used to cache
853 * any non-present entries)
855 if (non_present_entry_flush) {
856 if (!cap_caching_mode(iommu->cap))
863 case DMA_TLB_GLOBAL_FLUSH:
864 /* global flush doesn't need to set IVA_REG */
865 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
867 case DMA_TLB_DSI_FLUSH:
868 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
870 case DMA_TLB_PSI_FLUSH:
871 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
872 /* Note: always flush non-leaf currently */
873 val_iva = size_order | addr;
878 /* Note: set drain read/write */
881 * This is probably only needed to be extra safe; it looks like we can
882 * ignore it without any impact.
884 if (cap_read_drain(iommu->cap))
885 val |= DMA_TLB_READ_DRAIN;
887 if (cap_write_drain(iommu->cap))
888 val |= DMA_TLB_WRITE_DRAIN;
890 spin_lock_irqsave(&iommu->register_lock, flag);
891 /* Note: Only uses first TLB reg currently */
893 dmar_writeq(iommu->reg + tlb_offset, val_iva);
894 dmar_writeq(iommu->reg + tlb_offset + 8, val);
896 /* Make sure hardware completes it */
897 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
898 dmar_readq, (!(val & DMA_TLB_IVT)), val);
900 spin_unlock_irqrestore(&iommu->register_lock, flag);
902 /* check IOTLB invalidation granularity */
903 if (DMA_TLB_IAIG(val) == 0)
904 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
905 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
906 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
907 (unsigned long long)DMA_TLB_IIRG(type),
908 (unsigned long long)DMA_TLB_IAIG(val));
909 /* flush iotlb entry will implicitly flush write buffer */
913 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
914 u64 addr, unsigned int pages, int non_present_entry_flush)
918 BUG_ON(addr & (~VTD_PAGE_MASK));
921 /* Fallback to domain selective flush if no PSI support */
922 if (!cap_pgsel_inv(iommu->cap))
923 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
925 non_present_entry_flush);
928 * PSI requires the page count to be a power of two, and the base
929 * address to be naturally aligned to that size
931 mask = ilog2(__roundup_pow_of_two(pages));
932 /* Fallback to domain selective flush if size is too big */
933 if (mask > cap_max_amask_val(iommu->cap))
934 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
935 DMA_TLB_DSI_FLUSH, non_present_entry_flush);
937 return iommu->flush.flush_iotlb(iommu, did, addr, mask,
939 non_present_entry_flush);
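/*
 * Worked example of the PSI mask computed above (illustrative only):
 * flushing pages = 9 gives mask = ilog2(__roundup_pow_of_two(9)) =
 * ilog2(16) = 4, i.e. one aligned invalidation covering 16 pages (64KB
 * with 4KB pages).  If that mask exceeded cap_max_amask_val() we would
 * instead have fallen back to the domain-selective flush above.
 */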
942 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
947 spin_lock_irqsave(&iommu->register_lock, flags);
948 pmen = readl(iommu->reg + DMAR_PMEN_REG);
949 pmen &= ~DMA_PMEN_EPM;
950 writel(pmen, iommu->reg + DMAR_PMEN_REG);
952 /* wait for the protected region status bit to clear */
953 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
954 readl, !(pmen & DMA_PMEN_PRS), pmen);
956 spin_unlock_irqrestore(&iommu->register_lock, flags);
959 static int iommu_enable_translation(struct intel_iommu *iommu)
964 spin_lock_irqsave(&iommu->register_lock, flags);
965 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
967 /* Make sure hardware completes it */
968 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
969 readl, (sts & DMA_GSTS_TES), sts);
971 iommu->gcmd |= DMA_GCMD_TE;
972 spin_unlock_irqrestore(&iommu->register_lock, flags);
976 static int iommu_disable_translation(struct intel_iommu *iommu)
981 spin_lock_irqsave(&iommu->register_lock, flag);
982 iommu->gcmd &= ~DMA_GCMD_TE;
983 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
985 /* Make sure hardware completes it */
986 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
987 readl, (!(sts & DMA_GSTS_TES)), sts);
989 spin_unlock_irqrestore(&iommu->register_lock, flag);
993 /* iommu interrupt handling. Most stuff are MSI-like. */
995 static const char *fault_reason_strings[] =
998 "Present bit in root entry is clear",
999 "Present bit in context entry is clear",
1000 "Invalid context entry",
1001 "Access beyond MGAW",
1002 "PTE Write access is not set",
1003 "PTE Read access is not set",
1004 "Next page table ptr is invalid",
1005 "Root table address invalid",
1006 "Context table ptr is invalid",
1007 "non-zero reserved fields in RTP",
1008 "non-zero reserved fields in CTP",
1009 "non-zero reserved fields in PTE",
1011 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
1013 const char *dmar_get_fault_reason(u8 fault_reason)
1015 if (fault_reason > MAX_FAULT_REASON_IDX)
1018 return fault_reason_strings[fault_reason];
1021 void dmar_msi_unmask(unsigned int irq)
1023 struct intel_iommu *iommu = get_irq_data(irq);
1027 spin_lock_irqsave(&iommu->register_lock, flag);
1028 writel(0, iommu->reg + DMAR_FECTL_REG);
1029 /* Read a reg to force flush the post write */
1030 readl(iommu->reg + DMAR_FECTL_REG);
1031 spin_unlock_irqrestore(&iommu->register_lock, flag);
1034 void dmar_msi_mask(unsigned int irq)
1037 struct intel_iommu *iommu = get_irq_data(irq);
1040 spin_lock_irqsave(&iommu->register_lock, flag);
1041 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1042 /* Read a reg to force flush the post write */
1043 readl(iommu->reg + DMAR_FECTL_REG);
1044 spin_unlock_irqrestore(&iommu->register_lock, flag);
1047 void dmar_msi_write(int irq, struct msi_msg *msg)
1049 struct intel_iommu *iommu = get_irq_data(irq);
1052 spin_lock_irqsave(&iommu->register_lock, flag);
1053 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1054 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1055 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1056 spin_unlock_irqrestore(&iommu->register_lock, flag);
1059 void dmar_msi_read(int irq, struct msi_msg *msg)
1061 struct intel_iommu *iommu = get_irq_data(irq);
1064 spin_lock_irqsave(&iommu->register_lock, flag);
1065 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1066 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1067 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1068 spin_unlock_irqrestore(&iommu->register_lock, flag);
1071 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1072 u8 fault_reason, u16 source_id, unsigned long long addr)
1076 reason = dmar_get_fault_reason(fault_reason);
1079 "DMAR:[%s] Request device [%02x:%02x.%d] "
1080 "fault addr %llx \n"
1081 "DMAR:[fault reason %02d] %s\n",
1082 (type ? "DMA Read" : "DMA Write"),
1083 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1084 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1088 #define PRIMARY_FAULT_REG_LEN (16)
1089 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1091 struct intel_iommu *iommu = dev_id;
1092 int reg, fault_index;
1096 spin_lock_irqsave(&iommu->register_lock, flag);
1097 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1099 /* TBD: ignore advanced fault log currently */
1100 if (!(fault_status & DMA_FSTS_PPF))
1101 goto clear_overflow;
1103 fault_index = dma_fsts_fault_record_index(fault_status);
1104 reg = cap_fault_reg_offset(iommu->cap);
1112 /* highest 32 bits */
1113 data = readl(iommu->reg + reg +
1114 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1115 if (!(data & DMA_FRCD_F))
1118 fault_reason = dma_frcd_fault_reason(data);
1119 type = dma_frcd_type(data);
1121 data = readl(iommu->reg + reg +
1122 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1123 source_id = dma_frcd_source_id(data);
1125 guest_addr = dmar_readq(iommu->reg + reg +
1126 fault_index * PRIMARY_FAULT_REG_LEN);
1127 guest_addr = dma_frcd_page_addr(guest_addr);
1128 /* clear the fault */
1129 writel(DMA_FRCD_F, iommu->reg + reg +
1130 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1132 spin_unlock_irqrestore(&iommu->register_lock, flag);
1134 iommu_page_fault_do_one(iommu, type, fault_reason,
1135 source_id, guest_addr);
1138 if (fault_index > cap_num_fault_regs(iommu->cap))
1140 spin_lock_irqsave(&iommu->register_lock, flag);
1143 /* clear primary fault overflow */
1144 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1145 if (fault_status & DMA_FSTS_PFO)
1146 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1148 spin_unlock_irqrestore(&iommu->register_lock, flag);
1152 int dmar_set_interrupt(struct intel_iommu *iommu)
1158 printk(KERN_ERR "IOMMU: no free vectors\n");
1162 set_irq_data(irq, iommu);
1165 ret = arch_setup_dmar_msi(irq);
1167 set_irq_data(irq, NULL);
1173 /* Force fault register is cleared */
1174 iommu_page_fault(irq, iommu);
1176 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1178 printk(KERN_ERR "IOMMU: can't request irq\n");
1182 static int iommu_init_domains(struct intel_iommu *iommu)
1184 unsigned long ndomains;
1185 unsigned long nlongs;
1187 ndomains = cap_ndoms(iommu->cap);
1188 pr_debug("Number of Domains supported <%ld>\n", ndomains);
1189 nlongs = BITS_TO_LONGS(ndomains);
1191 /* TBD: there might be 64K domains,
1192 * consider other allocation for future chip
1194 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1195 if (!iommu->domain_ids) {
1196 printk(KERN_ERR "Allocating domain id array failed\n");
1199 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1201 if (!iommu->domains) {
1202 printk(KERN_ERR "Allocating domain array failed\n");
1203 kfree(iommu->domain_ids);
1207 spin_lock_init(&iommu->lock);
1210 * if Caching mode is set, then invalid translations are tagged
1211 * with domainid 0. Hence we need to pre-allocate it.
1213 if (cap_caching_mode(iommu->cap))
1214 set_bit(0, iommu->domain_ids);
1219 static void domain_exit(struct dmar_domain *domain);
1220 static void vm_domain_exit(struct dmar_domain *domain);
1222 void free_dmar_iommu(struct intel_iommu *iommu)
1224 struct dmar_domain *domain;
1226 unsigned long flags;
1228 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1229 for (; i < cap_ndoms(iommu->cap); ) {
1230 domain = iommu->domains[i];
1231 clear_bit(i, iommu->domain_ids);
1233 spin_lock_irqsave(&domain->iommu_lock, flags);
1234 if (--domain->iommu_count == 0) {
1235 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1236 vm_domain_exit(domain);
1238 domain_exit(domain);
1240 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1242 i = find_next_bit(iommu->domain_ids,
1243 cap_ndoms(iommu->cap), i+1);
1246 if (iommu->gcmd & DMA_GCMD_TE)
1247 iommu_disable_translation(iommu);
1250 set_irq_data(iommu->irq, NULL);
1251 /* This will mask the irq */
1252 free_irq(iommu->irq, iommu);
1253 destroy_irq(iommu->irq);
1256 kfree(iommu->domains);
1257 kfree(iommu->domain_ids);
1259 g_iommus[iommu->seq_id] = NULL;
1261 /* if all iommus are freed, free g_iommus */
1262 for (i = 0; i < g_num_of_iommus; i++) {
1267 if (i == g_num_of_iommus)
1270 /* free context mapping */
1271 free_context_table(iommu);
1274 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1277 unsigned long ndomains;
1278 struct dmar_domain *domain;
1279 unsigned long flags;
1281 domain = alloc_domain_mem();
1285 ndomains = cap_ndoms(iommu->cap);
1287 spin_lock_irqsave(&iommu->lock, flags);
1288 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1289 if (num >= ndomains) {
1290 spin_unlock_irqrestore(&iommu->lock, flags);
1291 free_domain_mem(domain);
1292 printk(KERN_ERR "IOMMU: no free domain ids\n");
1296 set_bit(num, iommu->domain_ids);
1298 memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1299 set_bit(iommu->seq_id, &domain->iommu_bmp);
1301 iommu->domains[num] = domain;
1302 spin_unlock_irqrestore(&iommu->lock, flags);
1307 static void iommu_free_domain(struct dmar_domain *domain)
1309 unsigned long flags;
1310 struct intel_iommu *iommu;
1312 iommu = domain_get_iommu(domain);
1314 spin_lock_irqsave(&iommu->lock, flags);
1315 clear_bit(domain->id, iommu->domain_ids);
1316 spin_unlock_irqrestore(&iommu->lock, flags);
1319 static struct iova_domain reserved_iova_list;
1320 static struct lock_class_key reserved_alloc_key;
1321 static struct lock_class_key reserved_rbtree_key;
1323 static void dmar_init_reserved_ranges(void)
1325 struct pci_dev *pdev = NULL;
1330 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1332 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1333 &reserved_alloc_key);
1334 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1335 &reserved_rbtree_key);
1337 /* IOAPIC ranges shouldn't be accessed by DMA */
1338 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1339 IOVA_PFN(IOAPIC_RANGE_END));
1341 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1343 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1344 for_each_pci_dev(pdev) {
1347 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1348 r = &pdev->resource[i];
1349 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353 size = r->end - addr;
1354 size = PAGE_ALIGN(size);
1355 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1356 IOVA_PFN(size + addr) - 1);
1358 printk(KERN_ERR "Reserve iova failed\n");
1364 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1366 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1369 static inline int guestwidth_to_adjustwidth(int gaw)
1372 int r = (gaw - 12) % 9;
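/*
 * Worked example for guestwidth_to_adjustwidth() above (illustrative
 * only): the adjusted width rounds the guest address width up so that the
 * bits above the 12-bit page offset split evenly into 9-bit levels.
 * gaw = 48 gives r = (48 - 12) % 9 = 0 and stays 48, while gaw = 36 gives
 * r = 6 and is rounded up to 36 + 9 - 6 = 39.
 */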
1383 static int domain_init(struct dmar_domain *domain, int guest_width)
1385 struct intel_iommu *iommu;
1386 int adjust_width, agaw;
1387 unsigned long sagaw;
1389 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1390 spin_lock_init(&domain->mapping_lock);
1391 spin_lock_init(&domain->iommu_lock);
1393 domain_reserve_special_ranges(domain);
1395 /* calculate AGAW */
1396 iommu = domain_get_iommu(domain);
1397 if (guest_width > cap_mgaw(iommu->cap))
1398 guest_width = cap_mgaw(iommu->cap);
1399 domain->gaw = guest_width;
1400 adjust_width = guestwidth_to_adjustwidth(guest_width);
1401 agaw = width_to_agaw(adjust_width);
1402 sagaw = cap_sagaw(iommu->cap);
1403 if (!test_bit(agaw, &sagaw)) {
1404 /* hardware doesn't support it, choose a bigger one */
1405 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1406 agaw = find_next_bit(&sagaw, 5, agaw);
1410 domain->agaw = agaw;
1411 INIT_LIST_HEAD(&domain->devices);
1413 if (ecap_coherent(iommu->ecap))
1414 domain->iommu_coherency = 1;
1416 domain->iommu_coherency = 0;
1418 domain->iommu_count = 1;
1420 /* always allocate the top pgd */
1421 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1424 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1428 static void domain_exit(struct dmar_domain *domain)
1432 /* Domain 0 is reserved, so don't process it */
1436 domain_remove_dev_info(domain);
1438 put_iova_domain(&domain->iovad);
1439 end = DOMAIN_MAX_ADDR(domain->gaw);
1440 end = end & (~PAGE_MASK);
1443 dma_pte_clear_range(domain, 0, end);
1445 /* free page tables */
1446 dma_pte_free_pagetable(domain, 0, end);
1448 iommu_free_domain(domain);
1449 free_domain_mem(domain);
1452 static int domain_context_mapping_one(struct dmar_domain *domain,
1455 struct context_entry *context;
1456 unsigned long flags;
1457 struct intel_iommu *iommu;
1458 struct dma_pte *pgd;
1460 unsigned long ndomains;
1464 pr_debug("Set context mapping for %02x:%02x.%d\n",
1465 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1466 BUG_ON(!domain->pgd);
1468 iommu = device_to_iommu(bus, devfn);
1472 context = device_to_context_entry(iommu, bus, devfn);
1475 spin_lock_irqsave(&iommu->lock, flags);
1476 if (context_present(context)) {
1477 spin_unlock_irqrestore(&iommu->lock, flags);
1484 if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1487 /* find an available domain id for this device in iommu */
1488 ndomains = cap_ndoms(iommu->cap);
1489 num = find_first_bit(iommu->domain_ids, ndomains);
1490 for (; num < ndomains; ) {
1491 if (iommu->domains[num] == domain) {
1496 num = find_next_bit(iommu->domain_ids,
1497 cap_ndoms(iommu->cap), num+1);
1501 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502 if (num >= ndomains) {
1503 spin_unlock_irqrestore(&iommu->lock, flags);
1504 printk(KERN_ERR "IOMMU: no free domain ids\n");
1508 set_bit(num, iommu->domain_ids);
1509 iommu->domains[num] = domain;
1513 /* Skip top levels of page tables for
1514 * iommu which has less agaw than default.
1516 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1517 pgd = phys_to_virt(dma_pte_addr(pgd));
1518 if (!dma_pte_present(pgd)) {
1519 spin_unlock_irqrestore(&iommu->lock, flags);
1525 context_set_domain_id(context, id);
1526 context_set_address_width(context, iommu->agaw);
1527 context_set_address_root(context, virt_to_phys(pgd));
1528 context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1529 context_set_fault_enable(context);
1530 context_set_present(context);
1531 domain_flush_cache(domain, context, sizeof(*context));
1533 /* it's a non-present to present mapping */
1534 if (iommu->flush.flush_context(iommu, domain->id,
1535 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1536 DMA_CCMD_DEVICE_INVL, 1))
1537 iommu_flush_write_buffer(iommu);
1539 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1541 spin_unlock_irqrestore(&iommu->lock, flags);
1543 spin_lock_irqsave(&domain->iommu_lock, flags);
1544 if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1545 domain->iommu_count++;
1546 domain_update_iommu_coherency(domain);
1548 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1553 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1556 struct pci_dev *tmp, *parent;
1558 ret = domain_context_mapping_one(domain, pdev->bus->number,
1563 /* dependent device mapping */
1564 tmp = pci_find_upstream_pcie_bridge(pdev);
1567 /* Secondary interface's bus number and devfn 0 */
1568 parent = pdev->bus->self;
1569 while (parent != tmp) {
1570 ret = domain_context_mapping_one(domain, parent->bus->number,
1574 parent = parent->bus->self;
1576 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1577 return domain_context_mapping_one(domain,
1578 tmp->subordinate->number, 0);
1579 else /* this is a legacy PCI bridge */
1580 return domain_context_mapping_one(domain,
1581 tmp->bus->number, tmp->devfn);
1584 static int domain_context_mapped(struct pci_dev *pdev)
1587 struct pci_dev *tmp, *parent;
1588 struct intel_iommu *iommu;
1590 iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1594 ret = device_context_mapped(iommu,
1595 pdev->bus->number, pdev->devfn);
1598 /* dependent device mapping */
1599 tmp = pci_find_upstream_pcie_bridge(pdev);
1602 /* Secondary interface's bus number and devfn 0 */
1603 parent = pdev->bus->self;
1604 while (parent != tmp) {
1605 ret = device_context_mapped(iommu, parent->bus->number,
1609 parent = parent->bus->self;
1612 return device_context_mapped(iommu,
1613 tmp->subordinate->number, 0);
1615 return device_context_mapped(iommu,
1616 tmp->bus->number, tmp->devfn);
1620 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1621 u64 hpa, size_t size, int prot)
1623 u64 start_pfn, end_pfn;
1624 struct dma_pte *pte;
1626 int addr_width = agaw_to_width(domain->agaw);
1628 hpa &= (((u64)1) << addr_width) - 1;
1630 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1633 start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1634 end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1636 while (start_pfn < end_pfn) {
1637 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1640 /* We don't need lock here, nobody else
1641 * touches the iova range
1643 BUG_ON(dma_pte_addr(pte));
1644 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1645 dma_set_pte_prot(pte, prot);
1646 domain_flush_cache(domain, pte, sizeof(*pte));
1653 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1658 clear_context_table(iommu, bus, devfn);
1659 iommu->flush.flush_context(iommu, 0, 0, 0,
1660 DMA_CCMD_GLOBAL_INVL, 0);
1661 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1662 DMA_TLB_GLOBAL_FLUSH, 0);
1665 static void domain_remove_dev_info(struct dmar_domain *domain)
1667 struct device_domain_info *info;
1668 unsigned long flags;
1669 struct intel_iommu *iommu;
1671 spin_lock_irqsave(&device_domain_lock, flags);
1672 while (!list_empty(&domain->devices)) {
1673 info = list_entry(domain->devices.next,
1674 struct device_domain_info, link);
1675 list_del(&info->link);
1676 list_del(&info->global);
1678 info->dev->dev.archdata.iommu = NULL;
1679 spin_unlock_irqrestore(&device_domain_lock, flags);
1681 iommu = device_to_iommu(info->bus, info->devfn);
1682 iommu_detach_dev(iommu, info->bus, info->devfn);
1683 free_devinfo_mem(info);
1685 spin_lock_irqsave(&device_domain_lock, flags);
1687 spin_unlock_irqrestore(&device_domain_lock, flags);
1692 * Note: we use struct pci_dev->dev.archdata.iommu to store the iommu info
1694 static struct dmar_domain *
1695 find_domain(struct pci_dev *pdev)
1697 struct device_domain_info *info;
1699 /* No lock here, assumes no domain exit in normal case */
1700 info = pdev->dev.archdata.iommu;
1702 return info->domain;
1706 /* domain is initialized */
1707 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1709 struct dmar_domain *domain, *found = NULL;
1710 struct intel_iommu *iommu;
1711 struct dmar_drhd_unit *drhd;
1712 struct device_domain_info *info, *tmp;
1713 struct pci_dev *dev_tmp;
1714 unsigned long flags;
1715 int bus = 0, devfn = 0;
1717 domain = find_domain(pdev);
1721 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1723 if (dev_tmp->is_pcie) {
1724 bus = dev_tmp->subordinate->number;
1727 bus = dev_tmp->bus->number;
1728 devfn = dev_tmp->devfn;
1730 spin_lock_irqsave(&device_domain_lock, flags);
1731 list_for_each_entry(info, &device_domain_list, global) {
1732 if (info->bus == bus && info->devfn == devfn) {
1733 found = info->domain;
1737 spin_unlock_irqrestore(&device_domain_lock, flags);
1738 /* pcie-pci bridge already has a domain, use it */
1745 /* Allocate new domain for the device */
1746 drhd = dmar_find_matched_drhd_unit(pdev);
1748 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1752 iommu = drhd->iommu;
1754 domain = iommu_alloc_domain(iommu);
1758 if (domain_init(domain, gaw)) {
1759 domain_exit(domain);
1763 /* register pcie-to-pci device */
1765 info = alloc_devinfo_mem();
1767 domain_exit(domain);
1771 info->devfn = devfn;
1773 info->domain = domain;
1774 /* This domain is shared by devices under p2p bridge */
1775 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1777 /* pcie-to-pci bridge already has a domain, use it */
1779 spin_lock_irqsave(&device_domain_lock, flags);
1780 list_for_each_entry(tmp, &device_domain_list, global) {
1781 if (tmp->bus == bus && tmp->devfn == devfn) {
1782 found = tmp->domain;
1787 free_devinfo_mem(info);
1788 domain_exit(domain);
1791 list_add(&info->link, &domain->devices);
1792 list_add(&info->global, &device_domain_list);
1794 spin_unlock_irqrestore(&device_domain_lock, flags);
1798 info = alloc_devinfo_mem();
1801 info->bus = pdev->bus->number;
1802 info->devfn = pdev->devfn;
1804 info->domain = domain;
1805 spin_lock_irqsave(&device_domain_lock, flags);
1806 /* somebody else may have created the domain already */
1807 found = find_domain(pdev);
1808 if (found != NULL) {
1809 spin_unlock_irqrestore(&device_domain_lock, flags);
1810 if (found != domain) {
1811 domain_exit(domain);
1814 free_devinfo_mem(info);
1817 list_add(&info->link, &domain->devices);
1818 list_add(&info->global, &device_domain_list);
1819 pdev->dev.archdata.iommu = info;
1820 spin_unlock_irqrestore(&device_domain_lock, flags);
1823 /* recheck it here, maybe others set it */
1824 return find_domain(pdev);
1827 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1828 unsigned long long start,
1829 unsigned long long end)
1831 struct dmar_domain *domain;
1833 unsigned long long base;
1837 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1838 pci_name(pdev), start, end);
1839 /* page table init */
1840 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1844 /* The address might not be aligned */
1845 base = start & PAGE_MASK;
1847 size = PAGE_ALIGN(size);
1848 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1849 IOVA_PFN(base + size) - 1)) {
1850 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1855 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1856 size, base, pci_name(pdev));
1858 * RMRR range might overlap with the physical memory range,
1861 dma_pte_clear_range(domain, base, base + size);
1863 ret = domain_page_mapping(domain, base, base, size,
1864 DMA_PTE_READ|DMA_PTE_WRITE);
1868 /* context entry init */
1869 ret = domain_context_mapping(domain, pdev);
1873 domain_exit(domain);
1878 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1879 struct pci_dev *pdev)
1881 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1883 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1884 rmrr->end_address + 1);
1887 #ifdef CONFIG_DMAR_GFX_WA
1888 struct iommu_prepare_data {
1889 struct pci_dev *pdev;
1893 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1894 unsigned long end_pfn, void *datax)
1896 struct iommu_prepare_data *data;
1898 data = (struct iommu_prepare_data *)datax;
1900 data->ret = iommu_prepare_identity_map(data->pdev,
1901 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1906 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1909 struct iommu_prepare_data data;
1914 for_each_online_node(nid) {
1915 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1922 static void __init iommu_prepare_gfx_mapping(void)
1924 struct pci_dev *pdev = NULL;
1927 for_each_pci_dev(pdev) {
1928 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1929 !IS_GFX_DEVICE(pdev))
1931 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1933 ret = iommu_prepare_with_active_regions(pdev);
1935 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1938 #else /* !CONFIG_DMAR_GFX_WA */
1939 static inline void iommu_prepare_gfx_mapping(void)
1945 #ifdef CONFIG_DMAR_FLOPPY_WA
1946 static inline void iommu_prepare_isa(void)
1948 struct pci_dev *pdev;
1951 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1955 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1956 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1959 printk("IOMMU: Failed to create 0-16M identity map, "
1960 "floppy might not work\n");
1964 static inline void iommu_prepare_isa(void)
1968 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1970 static int __init init_dmars(void)
1972 struct dmar_drhd_unit *drhd;
1973 struct dmar_rmrr_unit *rmrr;
1974 struct pci_dev *pdev;
1975 struct intel_iommu *iommu;
1976 int i, ret, unit = 0;
1981 * initialize and program root entry to not present
1984 for_each_drhd_unit(drhd) {
1987 * lock not needed as this is only incremented in the single
1988 * threaded kernel __init code path; all other accesses are read-only
1993 g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1996 printk(KERN_ERR "Allocating global iommu array failed\n");
2001 deferred_flush = kzalloc(g_num_of_iommus *
2002 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2003 if (!deferred_flush) {
2009 for_each_drhd_unit(drhd) {
2013 iommu = drhd->iommu;
2014 g_iommus[iommu->seq_id] = iommu;
2016 ret = iommu_init_domains(iommu);
2022 * we could share the same root & context tables
2023 * among all IOMMUs. Need to split it later.
2025 ret = iommu_alloc_root_entry(iommu);
2027 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2032 for_each_drhd_unit(drhd) {
2036 iommu = drhd->iommu;
2037 if (dmar_enable_qi(iommu)) {
2039 * Queued Invalidate not enabled, use Register Based
2042 iommu->flush.flush_context = __iommu_flush_context;
2043 iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2044 printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2046 (unsigned long long)drhd->reg_base_addr);
2048 iommu->flush.flush_context = qi_flush_context;
2049 iommu->flush.flush_iotlb = qi_flush_iotlb;
2050 printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2052 (unsigned long long)drhd->reg_base_addr);
2058 * for each dev attached to rmrr
2060 * locate drhd for dev, alloc domain for dev
2061 * allocate free domain
2062 * allocate page table entries for rmrr
2063 * if context not allocated for bus
2064 * allocate and init context
2065 * set present in root table for this bus
2066 * init context with domain, translation etc
2070 for_each_rmrr_units(rmrr) {
2071 for (i = 0; i < rmrr->devices_cnt; i++) {
2072 pdev = rmrr->devices[i];
2073 /* some BIOSes list non-existent devices in the DMAR table */
2076 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2079 "IOMMU: mapping reserved region failed\n");
2083 iommu_prepare_gfx_mapping();
2085 iommu_prepare_isa();
2090 * global invalidate context cache
2091 * global invalidate iotlb
2092 * enable translation
2094 for_each_drhd_unit(drhd) {
2097 iommu = drhd->iommu;
2098 sprintf (iommu->name, "dmar%d", unit++);
2100 iommu_flush_write_buffer(iommu);
2102 ret = dmar_set_interrupt(iommu);
2106 iommu_set_root_entry(iommu);
2108 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2110 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2112 iommu_disable_protect_mem_regions(iommu);
2114 ret = iommu_enable_translation(iommu);
2121 for_each_drhd_unit(drhd) {
2124 iommu = drhd->iommu;
2131 static inline u64 aligned_size(u64 host_addr, size_t size)
2134 addr = (host_addr & (~PAGE_MASK)) + size;
2135 return PAGE_ALIGN(addr);
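/*
 * Worked example for aligned_size() above (illustrative, 4KB pages):
 * host_addr = 0x12345678, size = 0x200 yields 0x678 + 0x200 = 0x878,
 * page-aligned up to 0x1000 (one page), while host_addr = 0x12345f00,
 * size = 0x200 straddles a page boundary and yields 0x2000 (two pages).
 */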
2139 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2143 /* Make sure it's in range */
2144 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2145 if (!size || (IOVA_START_ADDR + size > end))
2148 piova = alloc_iova(&domain->iovad,
2149 size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2153 static struct iova *
2154 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2155 size_t size, u64 dma_mask)
2157 struct pci_dev *pdev = to_pci_dev(dev);
2158 struct iova *iova = NULL;
2160 if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2161 iova = iommu_alloc_iova(domain, size, dma_mask);
2164 * First try to allocate an io virtual address in
2165 * DMA_32BIT_MASK and if that fails then try allocating
2168 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2170 iova = iommu_alloc_iova(domain, size, dma_mask);
2174 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
2181 static struct dmar_domain *
2182 get_valid_domain_for_dev(struct pci_dev *pdev)
2184 struct dmar_domain *domain;
2187 domain = get_domain_for_dev(pdev,
2188 DEFAULT_DOMAIN_ADDRESS_WIDTH);
2191 "Allocating domain for %s failed", pci_name(pdev));
2195 /* make sure context mapping is ok */
2196 if (unlikely(!domain_context_mapped(pdev))) {
2197 ret = domain_context_mapping(domain, pdev);
2200 "Domain context map for %s failed",
2209 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2210 size_t size, int dir, u64 dma_mask)
2212 struct pci_dev *pdev = to_pci_dev(hwdev);
2213 struct dmar_domain *domain;
2214 phys_addr_t start_paddr;
2218 struct intel_iommu *iommu;
2220 BUG_ON(dir == DMA_NONE);
2221 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2224 domain = get_valid_domain_for_dev(pdev);
2228 iommu = domain_get_iommu(domain);
2229 size = aligned_size((u64)paddr, size);
2231 iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2235 start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2238 * Check if DMAR supports zero-length reads on write only
2241 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2242 !cap_zlr(iommu->cap))
2243 prot |= DMA_PTE_READ;
2244 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2245 prot |= DMA_PTE_WRITE;
2247 * paddr to (paddr + size) might span a partial page; we should map the
2248 * whole page. Note: if two parts of one page are mapped separately, we
2249 * might have two guest_addrs mapping to the same host paddr, but this
2250 * is not a big problem
2252 ret = domain_page_mapping(domain, start_paddr,
2253 ((u64)paddr) & PAGE_MASK, size, prot);
2257 /* it's a non-present to present mapping */
2258 ret = iommu_flush_iotlb_psi(iommu, domain->id,
2259 start_paddr, size >> VTD_PAGE_SHIFT, 1);
2261 iommu_flush_write_buffer(iommu);
2263 return start_paddr + ((u64)paddr & (~PAGE_MASK));
2267 __free_iova(&domain->iovad, iova);
2268 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
2269 pci_name(pdev), size, (unsigned long long)paddr, dir);
2273 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2274 size_t size, int dir)
2276 return __intel_map_single(hwdev, paddr, size, dir,
2277 to_pci_dev(hwdev)->dma_mask);
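/*
 * Typical call path (illustrative sketch, not part of this driver): PCI
 * drivers do not call intel_map_single() directly; they go through the
 * generic DMA API, which dispatches via the intel_dma_ops table installed
 * in intel_iommu_init():
 *
 *	dma_addr_t dma;
 *
 *	dma = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
 *	...
 *	dma_unmap_single(&pdev->dev, dma, len, DMA_TO_DEVICE);
 *
 * The result should be checked with dma_mapping_error() before use.
 */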
2280 static void flush_unmaps(void)
2286 /* just flush them all */
2287 for (i = 0; i < g_num_of_iommus; i++) {
2288 struct intel_iommu *iommu = g_iommus[i];
2292 if (deferred_flush[i].next) {
2293 iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2294 DMA_TLB_GLOBAL_FLUSH, 0);
2295 for (j = 0; j < deferred_flush[i].next; j++) {
2296 __free_iova(&deferred_flush[i].domain[j]->iovad,
2297 deferred_flush[i].iova[j]);
2299 deferred_flush[i].next = 0;
2306 static void flush_unmaps_timeout(unsigned long data)
2308 unsigned long flags;
2310 spin_lock_irqsave(&async_umap_flush_lock, flags);
2312 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2315 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2317 unsigned long flags;
2319 struct intel_iommu *iommu;
2321 spin_lock_irqsave(&async_umap_flush_lock, flags);
2322 if (list_size == HIGH_WATER_MARK)
2325 iommu = domain_get_iommu(dom);
2326 iommu_id = iommu->seq_id;
2328 next = deferred_flush[iommu_id].next;
2329 deferred_flush[iommu_id].domain[next] = dom;
2330 deferred_flush[iommu_id].iova[next] = iova;
2331 deferred_flush[iommu_id].next++;
2334 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2338 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
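/*
 * Batching behaviour of add_unmap() above (illustrative summary): freed
 * iovas are parked per-iommu in deferred_flush[] and only released by
 * flush_unmaps(), either when the 10ms unmap_timer fires or once
 * list_size reaches HIGH_WATER_MARK (250), so one global IOTLB flush per
 * iommu is paid per batch rather than one flush per unmap.
 */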
2341 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2344 struct pci_dev *pdev = to_pci_dev(dev);
2345 struct dmar_domain *domain;
2346 unsigned long start_addr;
2348 struct intel_iommu *iommu;
2350 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2352 domain = find_domain(pdev);
2355 iommu = domain_get_iommu(domain);
2357 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2361 start_addr = iova->pfn_lo << PAGE_SHIFT;
2362 size = aligned_size((u64)dev_addr, size);
2364 pr_debug("Device %s unmapping: %lx@%llx\n",
2365 pci_name(pdev), size, (unsigned long long)start_addr);
2367 /* clear the whole page */
2368 dma_pte_clear_range(domain, start_addr, start_addr + size);
2369 /* free page tables */
2370 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2371 if (intel_iommu_strict) {
2372 if (iommu_flush_iotlb_psi(iommu,
2373 domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2374 iommu_flush_write_buffer(iommu);
2376 __free_iova(&domain->iovad, iova);
2378 add_unmap(domain, iova);
2380 * queue up the release of the unmap to save the roughly 1/6 of the
2381 * cpu time used up by the iotlb flush operation...
2386 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2387 dma_addr_t *dma_handle, gfp_t flags)
2392 size = PAGE_ALIGN(size);
2393 order = get_order(size);
2394 flags &= ~(GFP_DMA | GFP_DMA32);
2396 vaddr = (void *)__get_free_pages(flags, order);
2399 memset(vaddr, 0, size);
2401 *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2403 hwdev->coherent_dma_mask);
2406 free_pages((unsigned long)vaddr, order);
2410 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2411 dma_addr_t dma_handle)
2415 size = PAGE_ALIGN(size);
2416 order = get_order(size);
2418 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2419 free_pages((unsigned long)vaddr, order);
2422 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2424 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2425 int nelems, int dir)
2428 struct pci_dev *pdev = to_pci_dev(hwdev);
2429 struct dmar_domain *domain;
2430 unsigned long start_addr;
2434 struct scatterlist *sg;
2435 struct intel_iommu *iommu;
2437 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2440 domain = find_domain(pdev);
2443 iommu = domain_get_iommu(domain);
2445 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2448 for_each_sg(sglist, sg, nelems, i) {
2449 addr = SG_ENT_VIRT_ADDRESS(sg);
2450 size += aligned_size((u64)addr, sg->length);
2453 start_addr = iova->pfn_lo << PAGE_SHIFT;
2455 /* clear the whole page */
2456 dma_pte_clear_range(domain, start_addr, start_addr + size);
2457 /* free page tables */
2458 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2460 if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2461 size >> VTD_PAGE_SHIFT, 0))
2462 iommu_flush_write_buffer(iommu);
2465 __free_iova(&domain->iovad, iova);
2468 static int intel_nontranslate_map_sg(struct device *hddev,
2469 struct scatterlist *sglist, int nelems, int dir)
2472 struct scatterlist *sg;
2474 for_each_sg(sglist, sg, nelems, i) {
2475 BUG_ON(!sg_page(sg));
2476 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2477 sg->dma_length = sg->length;
int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
		 int dir)
{
	void *addr;
	int i;
	struct pci_dev *pdev = to_pci_dev(hwdev);
	struct dmar_domain *domain;
	size_t size = 0;
	int prot = 0;
	size_t offset = 0;
	struct iova *iova = NULL;
	int ret;
	struct scatterlist *sg;
	unsigned long start_addr;
	struct intel_iommu *iommu;

	BUG_ON(dir == DMA_NONE);
	if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
		return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);

	domain = get_valid_domain_for_dev(pdev);
	if (!domain)
		return 0;

	iommu = domain_get_iommu(domain);

	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size += aligned_size((u64)addr, sg->length);
	}

	iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
	if (!iova) {
		sglist->dma_length = 0;
		return 0;
	}

	/*
	 * Check if DMAR supports zero-length reads on write only
	 * mappings..
	 */
	if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
	    !cap_zlr(iommu->cap))
		prot |= DMA_PTE_READ;
	if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
		prot |= DMA_PTE_WRITE;

	start_addr = iova->pfn_lo << PAGE_SHIFT;

	for_each_sg(sglist, sg, nelems, i) {
		addr = SG_ENT_VIRT_ADDRESS(sg);
		addr = (void *)virt_to_phys(addr);
		size = aligned_size((u64)addr, sg->length);
		ret = domain_page_mapping(domain, start_addr + offset,
					  ((u64)addr) & PAGE_MASK,
					  size, prot);
		if (ret) {
			/* clear the page */
			dma_pte_clear_range(domain, start_addr,
					    start_addr + offset);
			/* free page tables */
			dma_pte_free_pagetable(domain, start_addr,
					       start_addr + offset);
			/* free iova */
			__free_iova(&domain->iovad, iova);
			return 0;
		}
		sg->dma_address = start_addr + offset +
				  ((u64)addr & (~PAGE_MASK));
		sg->dma_length = sg->length;
		offset += size;
	}

	/* it's a non-present to present mapping */
	if (iommu_flush_iotlb_psi(iommu, domain->id,
				  start_addr, offset >> VTD_PAGE_SHIFT, 1))
		iommu_flush_write_buffer(iommu);
	return nelems;
}
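/*
 * DMA mapping operations handed to the rest of the kernel once the
 * hardware has been brought up and dma_ops is switched over.
 */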
static struct dma_mapping_ops intel_dma_ops = {
	.alloc_coherent = intel_alloc_coherent,
	.free_coherent = intel_free_coherent,
	.map_single = intel_map_single,
	.unmap_single = intel_unmap_single,
	.map_sg = intel_map_sg,
	.unmap_sg = intel_unmap_sg,
};
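/*
 * Slab caches for the frequently allocated bookkeeping objects:
 * domains, device_domain_info structures and iova nodes.
 */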
static inline int iommu_domain_cache_init(void)
{
	int ret = 0;

	iommu_domain_cache = kmem_cache_create("iommu_domain",
					 sizeof(struct dmar_domain),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_domain_cache) {
		printk(KERN_ERR "Couldn't create iommu_domain cache\n");
		ret = -ENOMEM;
	}

	return ret;
}
static inline int iommu_devinfo_cache_init(void)
{
	int ret = 0;

	iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
					 sizeof(struct device_domain_info),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_devinfo_cache) {
		printk(KERN_ERR "Couldn't create devinfo cache\n");
		ret = -ENOMEM;
	}

	return ret;
}
static inline int iommu_iova_cache_init(void)
{
	int ret = 0;

	iommu_iova_cache = kmem_cache_create("iommu_iova",
					 sizeof(struct iova),
					 0,
					 SLAB_HWCACHE_ALIGN,
					 NULL);
	if (!iommu_iova_cache) {
		printk(KERN_ERR "Couldn't create iova cache\n");
		ret = -ENOMEM;
	}

	return ret;
}
static int __init iommu_init_mempool(void)
{
	int ret;

	ret = iommu_iova_cache_init();
	if (ret)
		return ret;

	ret = iommu_domain_cache_init();
	if (ret)
		goto domain_error;

	ret = iommu_devinfo_cache_init();
	if (!ret)
		return ret;

	kmem_cache_destroy(iommu_domain_cache);
domain_error:
	kmem_cache_destroy(iommu_iova_cache);

	return -ENOMEM;
}
static void __init iommu_exit_mempool(void)
{
	kmem_cache_destroy(iommu_devinfo_cache);
	kmem_cache_destroy(iommu_domain_cache);
	kmem_cache_destroy(iommu_iova_cache);
}
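/*
 * Mark DMAR units that cover no PCI devices as ignored and, when gfx
 * mapping is disabled, bypass units that exist only for graphics
 * devices.
 */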
static void __init init_no_remapping_devices(void)
{
	struct dmar_drhd_unit *drhd;

	for_each_drhd_unit(drhd) {
		if (!drhd->include_all) {
			int i;
			for (i = 0; i < drhd->devices_cnt; i++)
				if (drhd->devices[i] != NULL)
					break;
			/* ignore DMAR unit if no pci devices exist */
			if (i == drhd->devices_cnt)
				drhd->ignored = 1;
		}
	}

	if (dmar_map_gfx)
		return;

	for_each_drhd_unit(drhd) {
		int i;
		if (drhd->ignored || drhd->include_all)
			continue;

		for (i = 0; i < drhd->devices_cnt; i++)
			if (drhd->devices[i] &&
			    !IS_GFX_DEVICE(drhd->devices[i]))
				break;

		if (i < drhd->devices_cnt)
			continue;

		/* bypass IOMMU if it is just for gfx devices */
		drhd->ignored = 1;
		for (i = 0; i < drhd->devices_cnt; i++) {
			if (!drhd->devices[i])
				continue;
			drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
		}
	}
}
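/*
 * Main entry point: parse the DMAR table, set up the mempools and
 * reserved IOVA ranges, initialize the DMAR units and install
 * intel_dma_ops as the kernel's DMA mapping operations.
 */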
int __init intel_iommu_init(void)
{
	int ret = 0;

	if (dmar_table_init())
		return -ENODEV;

	if (dmar_dev_scope_init())
		return -ENODEV;

	/*
	 * Check the need for DMA-remapping initialization now.
	 * Above initialization will also be used by Interrupt-remapping.
	 */
	if (no_iommu || swiotlb || dmar_disabled)
		return -ENODEV;

	iommu_init_mempool();
	dmar_init_reserved_ranges();

	init_no_remapping_devices();

	ret = init_dmars();
	if (ret) {
		printk(KERN_ERR "IOMMU: dmar init failed\n");
		put_iova_domain(&reserved_iova_list);
		iommu_exit_mempool();
		return ret;
	}
	printk(KERN_INFO
	"PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");

	init_timer(&unmap_timer);
	force_iommu = 1;
	dma_ops = &intel_dma_ops;
	return 0;
}
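/*
 * The vm_domain_* helpers below manage domains created on behalf of
 * virtual machines (e.g. direct device assignment) rather than for the
 * host DMA API; such a domain may span several IOMMUs.
 */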
static int vm_domain_add_dev_info(struct dmar_domain *domain,
				  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	unsigned long flags;

	info = alloc_devinfo_mem();
	if (!info)
		return -ENOMEM;

	info->bus = pdev->bus->number;
	info->devfn = pdev->devfn;
	info->dev = pdev;
	info->domain = domain;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_add(&info->link, &domain->devices);
	list_add(&info->global, &device_domain_list);
	pdev->dev.archdata.iommu = info;
	spin_unlock_irqrestore(&device_domain_lock, flags);

	return 0;
}
static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
					  struct pci_dev *pdev)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags;
	int found = 0;
	struct list_head *entry, *tmp;

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return;

	spin_lock_irqsave(&device_domain_lock, flags);
	list_for_each_safe(entry, tmp, &domain->devices) {
		info = list_entry(entry, struct device_domain_info, link);
		if (info->bus == pdev->bus->number &&
		    info->devfn == pdev->devfn) {
			list_del(&info->link);
			list_del(&info->global);
			if (info->dev)
				info->dev->dev.archdata.iommu = NULL;
			spin_unlock_irqrestore(&device_domain_lock, flags);

			iommu_detach_dev(iommu, info->bus, info->devfn);
			free_devinfo_mem(info);

			spin_lock_irqsave(&device_domain_lock, flags);

			if (found)
				break;
			else
				continue;
		}

		/* if there is no other devices under the same iommu
		 * owned by this domain, clear this iommu in iommu_bmp
		 * update iommu count and coherency
		 */
		if (device_to_iommu(info->bus, info->devfn) == iommu)
			found = 1;
	}

	if (found == 0) {
		unsigned long tmp_flags;
		spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
		clear_bit(iommu->seq_id, &domain->iommu_bmp);
		domain->iommu_count--;
		domain_update_iommu_coherency(domain);
		spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
	}

	spin_unlock_irqrestore(&device_domain_lock, flags);
}
static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
{
	struct device_domain_info *info;
	struct intel_iommu *iommu;
	unsigned long flags1, flags2;

	spin_lock_irqsave(&device_domain_lock, flags1);
	while (!list_empty(&domain->devices)) {
		info = list_entry(domain->devices.next,
			struct device_domain_info, link);
		list_del(&info->link);
		list_del(&info->global);
		if (info->dev)
			info->dev->dev.archdata.iommu = NULL;

		spin_unlock_irqrestore(&device_domain_lock, flags1);

		iommu = device_to_iommu(info->bus, info->devfn);
		iommu_detach_dev(iommu, info->bus, info->devfn);

		/* clear this iommu in iommu_bmp, update iommu count
		 * and coherency
		 */
		spin_lock_irqsave(&domain->iommu_lock, flags2);
		if (test_and_clear_bit(iommu->seq_id,
				       &domain->iommu_bmp)) {
			domain->iommu_count--;
			domain_update_iommu_coherency(domain);
		}
		spin_unlock_irqrestore(&domain->iommu_lock, flags2);

		free_devinfo_mem(info);
		spin_lock_irqsave(&device_domain_lock, flags1);
	}
	spin_unlock_irqrestore(&device_domain_lock, flags1);
}
/* domain id for virtual machine, it won't be set in context */
static unsigned long vm_domid;

static int vm_domain_min_agaw(struct dmar_domain *domain)
{
	int i;
	int min_agaw = domain->agaw;

	i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
	for (; i < g_num_of_iommus; ) {
		if (min_agaw > g_iommus[i]->agaw)
			min_agaw = g_iommus[i]->agaw;

		i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
	}

	return min_agaw;
}
static struct dmar_domain *iommu_alloc_vm_domain(void)
{
	struct dmar_domain *domain;

	domain = alloc_domain_mem();
	if (!domain)
		return NULL;

	domain->id = vm_domid++;
	memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
	domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;

	return domain;
}
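/*
 * Initialize a freshly allocated VM domain: IOVA allocator, locks,
 * reserved ranges, guest address width and the top level page
 * directory.
 */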
static int vm_domain_init(struct dmar_domain *domain, int guest_width)
{
	int adjust_width;

	init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
	spin_lock_init(&domain->mapping_lock);
	spin_lock_init(&domain->iommu_lock);

	domain_reserve_special_ranges(domain);

	/* calculate AGAW */
	domain->gaw = guest_width;
	adjust_width = guestwidth_to_adjustwidth(guest_width);
	domain->agaw = width_to_agaw(adjust_width);

	INIT_LIST_HEAD(&domain->devices);

	domain->iommu_count = 0;
	domain->iommu_coherency = 0;
	domain->max_addr = 0;

	/* always allocate the top pgd */
	domain->pgd = (struct dma_pte *)alloc_pgtable_page();
	if (!domain->pgd)
		return -ENOMEM;
	domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
	return 0;
}
static void iommu_free_vm_domain(struct dmar_domain *domain)
{
	unsigned long flags;
	struct dmar_drhd_unit *drhd;
	struct intel_iommu *iommu;
	unsigned long i;
	unsigned long ndomains;

	for_each_drhd_unit(drhd) {
		if (drhd->ignored)
			continue;
		iommu = drhd->iommu;

		ndomains = cap_ndoms(iommu->cap);
		i = find_first_bit(iommu->domain_ids, ndomains);
		for (; i < ndomains; ) {
			if (iommu->domains[i] == domain) {
				spin_lock_irqsave(&iommu->lock, flags);
				clear_bit(i, iommu->domain_ids);
				iommu->domains[i] = NULL;
				spin_unlock_irqrestore(&iommu->lock, flags);
				break;
			}
			i = find_next_bit(iommu->domain_ids, ndomains, i+1);
		}
	}
}
static void vm_domain_exit(struct dmar_domain *domain)
{
	u64 end;

	/* Domain 0 is reserved, so don't process it */
	if (!domain)
		return;

	vm_domain_remove_all_dev_info(domain);
	put_iova_domain(&domain->iovad);
	end = DOMAIN_MAX_ADDR(domain->gaw);
	end = end & (~VTD_PAGE_MASK);

	/* clear ptes */
	dma_pte_clear_range(domain, 0, end);
	/* free page tables */
	dma_pte_free_pagetable(domain, 0, end);

	iommu_free_vm_domain(domain);
	free_domain_mem(domain);
}
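/*
 * Exported interface used by other kernel code (such as KVM device
 * assignment) to create and destroy domains, attach or detach PCI
 * devices and manage their mappings.
 */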
struct dmar_domain *intel_iommu_alloc_domain(void)
{
	struct dmar_domain *domain;

	domain = iommu_alloc_vm_domain();
	if (!domain) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: domain == NULL\n");
		return NULL;
	}
	if (vm_domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		printk(KERN_ERR
			"intel_iommu_domain_alloc: domain_init() failed\n");
		vm_domain_exit(domain);
		return NULL;
	}

	return domain;
}
EXPORT_SYMBOL_GPL(intel_iommu_alloc_domain);
void intel_iommu_free_domain(struct dmar_domain *domain)
{
	vm_domain_exit(domain);
}
EXPORT_SYMBOL_GPL(intel_iommu_free_domain);
int intel_iommu_attach_device(struct dmar_domain *domain,
			      struct pci_dev *pdev)
{
	struct intel_iommu *iommu;
	int addr_width;
	u64 end;
	int ret;

	/* normally pdev is not mapped */
	if (unlikely(domain_context_mapped(pdev))) {
		struct dmar_domain *old_domain;

		old_domain = find_domain(pdev);
		if (old_domain) {
			if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
				vm_domain_remove_one_dev_info(old_domain, pdev);
			else
				domain_remove_dev_info(old_domain);
		}
	}

	iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
	if (!iommu)
		return -ENODEV;

	/* check if this iommu agaw is sufficient for max mapped address */
	addr_width = agaw_to_width(iommu->agaw);
	end = DOMAIN_MAX_ADDR(addr_width);
	end = end & VTD_PAGE_MASK;
	if (end < domain->max_addr) {
		printk(KERN_ERR "%s: iommu agaw (%d) is not "
		       "sufficient for the mapped address (%llx)\n",
		       __func__, iommu->agaw, domain->max_addr);
		return -EFAULT;
	}

	ret = domain_context_mapping(domain, pdev);
	if (ret)
		return ret;

	ret = vm_domain_add_dev_info(domain, pdev);
	return ret;
}
EXPORT_SYMBOL_GPL(intel_iommu_attach_device);
void intel_iommu_detach_device(struct dmar_domain *domain,
			       struct pci_dev *pdev)
{
	vm_domain_remove_one_dev_info(domain, pdev);
}
EXPORT_SYMBOL_GPL(intel_iommu_detach_device);
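/*
 * Map a host physical range at the given IOVA, growing the domain's
 * recorded max_addr only after checking that every IOMMU in the domain
 * can address it.
 */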
int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
			    u64 hpa, size_t size, int prot)
{
	u64 max_addr;
	int addr_width;
	int ret;

	max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
	if (domain->max_addr < max_addr) {
		int min_agaw;
		u64 end;

		/* check if minimum agaw is sufficient for mapped address */
		min_agaw = vm_domain_min_agaw(domain);
		addr_width = agaw_to_width(min_agaw);
		end = DOMAIN_MAX_ADDR(addr_width);
		end = end & VTD_PAGE_MASK;
		if (end < max_addr) {
			printk(KERN_ERR "%s: iommu agaw (%d) is not "
			       "sufficient for the mapped address (%llx)\n",
			       __func__, min_agaw, max_addr);
			return -EFAULT;
		}
		domain->max_addr = max_addr;
	}

	ret = domain_page_mapping(domain, iova, hpa, size, prot);
	return ret;
}
EXPORT_SYMBOL_GPL(intel_iommu_map_address);
void intel_iommu_unmap_address(struct dmar_domain *domain,
			       dma_addr_t iova, size_t size)
{
	dma_addr_t base;

	/* The address might not be aligned */
	base = iova & VTD_PAGE_MASK;
	size = VTD_PAGE_ALIGN(size);
	dma_pte_clear_range(domain, base, base + size);

	if (domain->max_addr == base + size)
		domain->max_addr = base;
}
EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
int intel_iommu_found(void)
{
	return g_num_of_iommus;
}
EXPORT_SYMBOL_GPL(intel_iommu_found);
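/*
 * Translate an IOVA back to a host physical address by walking the
 * domain's page tables; returns 0 if no PTE is present.
 */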
u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
{
	struct dma_pte *pte;
	u64 phys = 0;

	pte = addr_to_dma_pte(domain, iova);
	if (pte)
		phys = dma_pte_addr(pte);

	return phys;
}
EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);