2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
46 #define IOAPIC_RANGE_START (0xfee00000)
47 #define IOAPIC_RANGE_END (0xfeefffff)
48 #define IOVA_START_ADDR (0x1000)
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
52 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55 static void flush_unmaps_timeout(unsigned long data);
57 DEFINE_TIMER(unmap_timer, flush_unmaps_timeout, 0, 0);
59 #define HIGH_WATER_MARK 250
60 struct deferred_flush_tables {
62 struct iova *iova[HIGH_WATER_MARK];
63 struct dmar_domain *domain[HIGH_WATER_MARK];
66 static struct deferred_flush_tables *deferred_flush;
68 /* bitmap for indexing intel_iommus */
69 static int g_num_of_iommus;
71 static DEFINE_SPINLOCK(async_umap_flush_lock);
72 static LIST_HEAD(unmaps_to_do);
75 static long list_size;
77 static void domain_remove_dev_info(struct dmar_domain *domain);
80 static int __initdata dmar_map_gfx = 1;
81 static int dmar_forcedac;
82 static int intel_iommu_strict;
84 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
85 static DEFINE_SPINLOCK(device_domain_lock);
86 static LIST_HEAD(device_domain_list);
88 static int __init intel_iommu_setup(char *str)
93 if (!strncmp(str, "off", 3)) {
95 printk(KERN_INFO"Intel-IOMMU: disabled\n");
96 } else if (!strncmp(str, "igfx_off", 8)) {
99 "Intel-IOMMU: disable GFX device mapping\n");
100 } else if (!strncmp(str, "forcedac", 8)) {
102 "Intel-IOMMU: Forcing DAC for PCI devices\n");
104 } else if (!strncmp(str, "strict", 6)) {
106 "Intel-IOMMU: disable batched IOTLB flush\n");
107 intel_iommu_strict = 1;
110 str += strcspn(str, ",");
116 __setup("intel_iommu=", intel_iommu_setup);
118 static struct kmem_cache *iommu_domain_cache;
119 static struct kmem_cache *iommu_devinfo_cache;
120 static struct kmem_cache *iommu_iova_cache;
122 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
127 /* trying to avoid low memory issues */
128 flags = current->flags & PF_MEMALLOC;
129 current->flags |= PF_MEMALLOC;
130 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
131 current->flags &= (~PF_MEMALLOC | flags);
136 static inline void *alloc_pgtable_page(void)
141 /* trying to avoid low memory issues */
142 flags = current->flags & PF_MEMALLOC;
143 current->flags |= PF_MEMALLOC;
144 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
145 current->flags &= (~PF_MEMALLOC | flags);
/*
 * Release a page-table page.  The pointer is handed to free_page() as an
 * unsigned long address.
 */
static inline void free_pgtable_page(void *vaddr)
{
	free_page((unsigned long)vaddr);
}
154 static inline void *alloc_domain_mem(void)
156 return iommu_kmem_cache_alloc(iommu_domain_cache);
159 static inline void free_domain_mem(void *vaddr)
161 kmem_cache_free(iommu_domain_cache, vaddr);
164 static inline void * alloc_devinfo_mem(void)
166 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
169 static inline void free_devinfo_mem(void *vaddr)
171 kmem_cache_free(iommu_devinfo_cache, vaddr);
174 struct iova *alloc_iova_mem(void)
176 return iommu_kmem_cache_alloc(iommu_iova_cache);
179 void free_iova_mem(struct iova *iova)
181 kmem_cache_free(iommu_iova_cache, iova);
184 /* Gets context entry for a given bus and devfn */
185 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
188 struct root_entry *root;
189 struct context_entry *context;
190 unsigned long phy_addr;
193 spin_lock_irqsave(&iommu->lock, flags);
194 root = &iommu->root_entry[bus];
195 context = get_context_addr_from_root(root);
197 context = (struct context_entry *)alloc_pgtable_page();
199 spin_unlock_irqrestore(&iommu->lock, flags);
202 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
203 phy_addr = virt_to_phys((void *)context);
204 set_root_value(root, phy_addr);
205 set_root_present(root);
206 __iommu_flush_cache(iommu, root, sizeof(*root));
208 spin_unlock_irqrestore(&iommu->lock, flags);
209 return &context[devfn];
212 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
214 struct root_entry *root;
215 struct context_entry *context;
219 spin_lock_irqsave(&iommu->lock, flags);
220 root = &iommu->root_entry[bus];
221 context = get_context_addr_from_root(root);
226 ret = context_present(context[devfn]);
228 spin_unlock_irqrestore(&iommu->lock, flags);
232 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
234 struct root_entry *root;
235 struct context_entry *context;
238 spin_lock_irqsave(&iommu->lock, flags);
239 root = &iommu->root_entry[bus];
240 context = get_context_addr_from_root(root);
242 context_clear_entry(context[devfn]);
243 __iommu_flush_cache(iommu, &context[devfn], \
246 spin_unlock_irqrestore(&iommu->lock, flags);
249 static void free_context_table(struct intel_iommu *iommu)
251 struct root_entry *root;
254 struct context_entry *context;
256 spin_lock_irqsave(&iommu->lock, flags);
257 if (!iommu->root_entry) {
260 for (i = 0; i < ROOT_ENTRY_NR; i++) {
261 root = &iommu->root_entry[i];
262 context = get_context_addr_from_root(root);
264 free_pgtable_page(context);
266 free_pgtable_page(iommu->root_entry);
267 iommu->root_entry = NULL;
269 spin_unlock_irqrestore(&iommu->lock, flags);
/* page table handling */

/* Each page-table level translates LEVEL_STRIDE bits of the address. */
#define LEVEL_STRIDE		(9)
/* Mask selecting one level's index within an address, after shifting. */
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/*
 * Number of page-table levels for a given AGAW encoding.
 *
 * A table with N levels covers 12 + N * LEVEL_STRIDE address bits (see
 * level_to_offset_bits()), and an AGAW value stands for
 * 30 + agaw * LEVEL_STRIDE bits (see agaw_to_width()); equating the two
 * gives N = agaw + 2.
 */
static inline int agaw_to_level(int agaw)
{
	return agaw + 2;
}

/* Address width in bits represented by an AGAW encoding. */
static inline int agaw_to_width(int agaw)
{
	return 30 + agaw * LEVEL_STRIDE;
}

/*
 * Inverse of agaw_to_width().  Integer division: widths that are not of
 * the form 30 + k * LEVEL_STRIDE round down.
 */
static inline int width_to_agaw(int width)
{
	return (width - 30) / LEVEL_STRIDE;
}

/* Bit position where the index for @level starts (level 1 starts at bit 12). */
static inline unsigned int level_to_offset_bits(int level)
{
	return (12 + (level - 1) * LEVEL_STRIDE);
}

/* Index into the @level table that @addr selects. */
static inline int address_level_offset(u64 addr, int level)
{
	return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
}

/* Mask that clears the address bits below @level's index field. */
static inline u64 level_mask(int level)
{
	return ((u64)-1 << level_to_offset_bits(level));
}

/* Number of bytes mapped by a single entry of a @level table. */
static inline u64 level_size(int level)
{
	return ((u64)1 << level_to_offset_bits(level));
}

/* Round @addr up to the next multiple of level_size(@level). */
static inline u64 align_to_level(u64 addr, int level)
{
	return ((addr + level_size(level) - 1) & level_mask(level));
}
317 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
319 int addr_width = agaw_to_width(domain->agaw);
320 struct dma_pte *parent, *pte = NULL;
321 int level = agaw_to_level(domain->agaw);
325 BUG_ON(!domain->pgd);
327 addr &= (((u64)1) << addr_width) - 1;
328 parent = domain->pgd;
330 spin_lock_irqsave(&domain->mapping_lock, flags);
334 offset = address_level_offset(addr, level);
335 pte = &parent[offset];
339 if (!dma_pte_present(*pte)) {
340 tmp_page = alloc_pgtable_page();
343 spin_unlock_irqrestore(&domain->mapping_lock,
347 __iommu_flush_cache(domain->iommu, tmp_page,
349 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
351 * high level table always sets r/w, last level page
352 * table control read/write
354 dma_set_pte_readable(*pte);
355 dma_set_pte_writable(*pte);
356 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
358 parent = phys_to_virt(dma_pte_addr(*pte));
362 spin_unlock_irqrestore(&domain->mapping_lock, flags);
366 /* return address's pte at specific level */
367 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
370 struct dma_pte *parent, *pte = NULL;
371 int total = agaw_to_level(domain->agaw);
374 parent = domain->pgd;
375 while (level <= total) {
376 offset = address_level_offset(addr, total);
377 pte = &parent[offset];
381 if (!dma_pte_present(*pte))
383 parent = phys_to_virt(dma_pte_addr(*pte));
389 /* clear one page's page table */
390 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
392 struct dma_pte *pte = NULL;
394 /* get last level pte */
395 pte = dma_addr_level_pte(domain, addr, 1);
399 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
403 /* clear last-level ptes; a TLB flush should follow */
404 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
406 int addr_width = agaw_to_width(domain->agaw);
408 start &= (((u64)1) << addr_width) - 1;
409 end &= (((u64)1) << addr_width) - 1;
410 /* in case it's partial page */
411 start = PAGE_ALIGN_4K(start);
414 /* we don't need lock here, nobody else touches the iova range */
415 while (start < end) {
416 dma_pte_clear_one(domain, start);
417 start += PAGE_SIZE_4K;
421 /* free page table pages. last level pte should already be cleared */
422 static void dma_pte_free_pagetable(struct dmar_domain *domain,
425 int addr_width = agaw_to_width(domain->agaw);
427 int total = agaw_to_level(domain->agaw);
431 start &= (((u64)1) << addr_width) - 1;
432 end &= (((u64)1) << addr_width) - 1;
434 /* we don't need lock here, nobody else touches the iova range */
436 while (level <= total) {
437 tmp = align_to_level(start, level);
438 if (tmp >= end || (tmp + level_size(level) > end))
442 pte = dma_addr_level_pte(domain, tmp, level);
445 phys_to_virt(dma_pte_addr(*pte)));
447 __iommu_flush_cache(domain->iommu,
450 tmp += level_size(level);
455 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
456 free_pgtable_page(domain->pgd);
462 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
464 struct root_entry *root;
467 root = (struct root_entry *)alloc_pgtable_page();
471 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
473 spin_lock_irqsave(&iommu->lock, flags);
474 iommu->root_entry = root;
475 spin_unlock_irqrestore(&iommu->lock, flags);
480 static void iommu_set_root_entry(struct intel_iommu *iommu)
486 addr = iommu->root_entry;
488 spin_lock_irqsave(&iommu->register_lock, flag);
489 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
491 cmd = iommu->gcmd | DMA_GCMD_SRTP;
492 writel(cmd, iommu->reg + DMAR_GCMD_REG);
494 /* Make sure hardware complete it */
495 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
496 readl, (sts & DMA_GSTS_RTPS), sts);
498 spin_unlock_irqrestore(&iommu->register_lock, flag);
501 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
506 if (!cap_rwbf(iommu->cap))
508 val = iommu->gcmd | DMA_GCMD_WBF;
510 spin_lock_irqsave(&iommu->register_lock, flag);
511 writel(val, iommu->reg + DMAR_GCMD_REG);
513 /* Make sure hardware complete it */
514 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
515 readl, (!(val & DMA_GSTS_WBFS)), val);
517 spin_unlock_irqrestore(&iommu->register_lock, flag);
520 /* the return value determines whether a write buffer flush is needed */
521 static int __iommu_flush_context(struct intel_iommu *iommu,
522 u16 did, u16 source_id, u8 function_mask, u64 type,
523 int non_present_entry_flush)
529 * In the non-present entry flush case, if hardware doesn't cache
530 * non-present entry we do nothing and if hardware cache non-present
531 * entry, we flush entries of domain 0 (the domain id is used to cache
532 * any non-present entries)
534 if (non_present_entry_flush) {
535 if (!cap_caching_mode(iommu->cap))
542 case DMA_CCMD_GLOBAL_INVL:
543 val = DMA_CCMD_GLOBAL_INVL;
545 case DMA_CCMD_DOMAIN_INVL:
546 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
548 case DMA_CCMD_DEVICE_INVL:
549 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
550 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
557 spin_lock_irqsave(&iommu->register_lock, flag);
558 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
560 /* Make sure hardware complete it */
561 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
562 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
564 spin_unlock_irqrestore(&iommu->register_lock, flag);
566 /* flushing a context entry will implicitly flush the write buffer */
570 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
571 int non_present_entry_flush)
573 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
574 non_present_entry_flush);
577 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
578 int non_present_entry_flush)
580 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
581 non_present_entry_flush);
584 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
585 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
587 return __iommu_flush_context(iommu, did, source_id, function_mask,
588 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
591 /* the return value determines whether a write buffer flush is needed */
592 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
593 u64 addr, unsigned int size_order, u64 type,
594 int non_present_entry_flush)
596 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
597 u64 val = 0, val_iva = 0;
601 * In the non-present entry flush case, if hardware doesn't cache
602 * non-present entry we do nothing and if hardware cache non-present
603 * entry, we flush entries of domain 0 (the domain id is used to cache
604 * any non-present entries)
606 if (non_present_entry_flush) {
607 if (!cap_caching_mode(iommu->cap))
614 case DMA_TLB_GLOBAL_FLUSH:
615 /* global flush doesn't need set IVA_REG */
616 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
618 case DMA_TLB_DSI_FLUSH:
619 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
621 case DMA_TLB_PSI_FLUSH:
622 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
623 /* Note: always flush non-leaf currently */
624 val_iva = size_order | addr;
629 /* Note: set drain read/write */
632 * This is probably to be super secure.. Looks like we can
633 * ignore it without any impact.
635 if (cap_read_drain(iommu->cap))
636 val |= DMA_TLB_READ_DRAIN;
638 if (cap_write_drain(iommu->cap))
639 val |= DMA_TLB_WRITE_DRAIN;
641 spin_lock_irqsave(&iommu->register_lock, flag);
642 /* Note: Only uses first TLB reg currently */
644 dmar_writeq(iommu->reg + tlb_offset, val_iva);
645 dmar_writeq(iommu->reg + tlb_offset + 8, val);
647 /* Make sure hardware complete it */
648 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
649 dmar_readq, (!(val & DMA_TLB_IVT)), val);
651 spin_unlock_irqrestore(&iommu->register_lock, flag);
653 /* check IOTLB invalidation granularity */
654 if (DMA_TLB_IAIG(val) == 0)
655 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
656 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
657 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
658 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
659 /* flushing an IOTLB entry will implicitly flush the write buffer */
663 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
664 int non_present_entry_flush)
666 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
667 non_present_entry_flush);
670 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
671 int non_present_entry_flush)
673 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
674 non_present_entry_flush);
677 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
678 u64 addr, unsigned int pages, int non_present_entry_flush)
682 BUG_ON(addr & (~PAGE_MASK_4K));
685 /* Fallback to domain selective flush if no PSI support */
686 if (!cap_pgsel_inv(iommu->cap))
687 return iommu_flush_iotlb_dsi(iommu, did,
688 non_present_entry_flush);
691 * PSI requires page size to be 2 ^ x, and the base address is naturally
692 * aligned to the size
694 mask = ilog2(__roundup_pow_of_two(pages));
695 /* Fallback to domain selective flush if size is too big */
696 if (mask > cap_max_amask_val(iommu->cap))
697 return iommu_flush_iotlb_dsi(iommu, did,
698 non_present_entry_flush);
700 return __iommu_flush_iotlb(iommu, did, addr, mask,
701 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
704 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
709 spin_lock_irqsave(&iommu->register_lock, flags);
710 pmen = readl(iommu->reg + DMAR_PMEN_REG);
711 pmen &= ~DMA_PMEN_EPM;
712 writel(pmen, iommu->reg + DMAR_PMEN_REG);
714 /* wait for the protected region status bit to clear */
715 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
716 readl, !(pmen & DMA_PMEN_PRS), pmen);
718 spin_unlock_irqrestore(&iommu->register_lock, flags);
721 static int iommu_enable_translation(struct intel_iommu *iommu)
726 spin_lock_irqsave(&iommu->register_lock, flags);
727 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
729 /* Make sure hardware complete it */
730 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
731 readl, (sts & DMA_GSTS_TES), sts);
733 iommu->gcmd |= DMA_GCMD_TE;
734 spin_unlock_irqrestore(&iommu->register_lock, flags);
738 static int iommu_disable_translation(struct intel_iommu *iommu)
743 spin_lock_irqsave(&iommu->register_lock, flag);
744 iommu->gcmd &= ~DMA_GCMD_TE;
745 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
747 /* Make sure hardware complete it */
748 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
749 readl, (!(sts & DMA_GSTS_TES)), sts);
751 spin_unlock_irqrestore(&iommu->register_lock, flag);
755 /* IOMMU interrupt handling. Most of it is MSI-like. */
757 static const char *fault_reason_strings[] =
760 "Present bit in root entry is clear",
761 "Present bit in context entry is clear",
762 "Invalid context entry",
763 "Access beyond MGAW",
764 "PTE Write access is not set",
765 "PTE Read access is not set",
766 "Next page table ptr is invalid",
767 "Root table address invalid",
768 "Context table ptr is invalid",
769 "non-zero reserved fields in RTP",
770 "non-zero reserved fields in CTP",
771 "non-zero reserved fields in PTE",
773 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
775 const char *dmar_get_fault_reason(u8 fault_reason)
777 if (fault_reason > MAX_FAULT_REASON_IDX)
780 return fault_reason_strings[fault_reason];
783 void dmar_msi_unmask(unsigned int irq)
785 struct intel_iommu *iommu = get_irq_data(irq);
789 spin_lock_irqsave(&iommu->register_lock, flag);
790 writel(0, iommu->reg + DMAR_FECTL_REG);
791 /* Read a reg to force flush the post write */
792 readl(iommu->reg + DMAR_FECTL_REG);
793 spin_unlock_irqrestore(&iommu->register_lock, flag);
796 void dmar_msi_mask(unsigned int irq)
799 struct intel_iommu *iommu = get_irq_data(irq);
802 spin_lock_irqsave(&iommu->register_lock, flag);
803 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
804 /* Read a reg to force flush the post write */
805 readl(iommu->reg + DMAR_FECTL_REG);
806 spin_unlock_irqrestore(&iommu->register_lock, flag);
809 void dmar_msi_write(int irq, struct msi_msg *msg)
811 struct intel_iommu *iommu = get_irq_data(irq);
814 spin_lock_irqsave(&iommu->register_lock, flag);
815 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
816 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
817 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
818 spin_unlock_irqrestore(&iommu->register_lock, flag);
821 void dmar_msi_read(int irq, struct msi_msg *msg)
823 struct intel_iommu *iommu = get_irq_data(irq);
826 spin_lock_irqsave(&iommu->register_lock, flag);
827 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
828 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
829 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
830 spin_unlock_irqrestore(&iommu->register_lock, flag);
833 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
834 u8 fault_reason, u16 source_id, u64 addr)
838 reason = dmar_get_fault_reason(fault_reason);
841 "DMAR:[%s] Request device [%02x:%02x.%d] "
843 "DMAR:[fault reason %02d] %s\n",
844 (type ? "DMA Read" : "DMA Write"),
845 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
846 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
850 #define PRIMARY_FAULT_REG_LEN (16)
851 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
853 struct intel_iommu *iommu = dev_id;
854 int reg, fault_index;
858 spin_lock_irqsave(&iommu->register_lock, flag);
859 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
861 /* TBD: ignore advanced fault log currently */
862 if (!(fault_status & DMA_FSTS_PPF))
865 fault_index = dma_fsts_fault_record_index(fault_status);
866 reg = cap_fault_reg_offset(iommu->cap);
874 /* highest 32 bits */
875 data = readl(iommu->reg + reg +
876 fault_index * PRIMARY_FAULT_REG_LEN + 12);
877 if (!(data & DMA_FRCD_F))
880 fault_reason = dma_frcd_fault_reason(data);
881 type = dma_frcd_type(data);
883 data = readl(iommu->reg + reg +
884 fault_index * PRIMARY_FAULT_REG_LEN + 8);
885 source_id = dma_frcd_source_id(data);
887 guest_addr = dmar_readq(iommu->reg + reg +
888 fault_index * PRIMARY_FAULT_REG_LEN);
889 guest_addr = dma_frcd_page_addr(guest_addr);
890 /* clear the fault */
891 writel(DMA_FRCD_F, iommu->reg + reg +
892 fault_index * PRIMARY_FAULT_REG_LEN + 12);
894 spin_unlock_irqrestore(&iommu->register_lock, flag);
896 iommu_page_fault_do_one(iommu, type, fault_reason,
897 source_id, guest_addr);
900 if (fault_index > cap_num_fault_regs(iommu->cap))
902 spin_lock_irqsave(&iommu->register_lock, flag);
905 /* clear primary fault overflow */
906 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
907 if (fault_status & DMA_FSTS_PFO)
908 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
910 spin_unlock_irqrestore(&iommu->register_lock, flag);
914 int dmar_set_interrupt(struct intel_iommu *iommu)
920 printk(KERN_ERR "IOMMU: no free vectors\n");
924 set_irq_data(irq, iommu);
927 ret = arch_setup_dmar_msi(irq);
929 set_irq_data(irq, NULL);
935 /* Force fault register is cleared */
936 iommu_page_fault(irq, iommu);
938 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
940 printk(KERN_ERR "IOMMU: can't request irq\n");
944 static int iommu_init_domains(struct intel_iommu *iommu)
946 unsigned long ndomains;
947 unsigned long nlongs;
949 ndomains = cap_ndoms(iommu->cap);
950 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
951 nlongs = BITS_TO_LONGS(ndomains);
953 /* TBD: there might be 64K domains,
954 * consider other allocation for future chip
956 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
957 if (!iommu->domain_ids) {
958 printk(KERN_ERR "Allocating domain id array failed\n");
961 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
963 if (!iommu->domains) {
964 printk(KERN_ERR "Allocating domain array failed\n");
965 kfree(iommu->domain_ids);
969 spin_lock_init(&iommu->lock);
972 * if Caching mode is set, then invalid translations are tagged
973 * with domainid 0. Hence we need to pre-allocate it.
975 if (cap_caching_mode(iommu->cap))
976 set_bit(0, iommu->domain_ids);
981 static void domain_exit(struct dmar_domain *domain);
983 void free_dmar_iommu(struct intel_iommu *iommu)
985 struct dmar_domain *domain;
988 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
989 for (; i < cap_ndoms(iommu->cap); ) {
990 domain = iommu->domains[i];
991 clear_bit(i, iommu->domain_ids);
993 i = find_next_bit(iommu->domain_ids,
994 cap_ndoms(iommu->cap), i+1);
997 if (iommu->gcmd & DMA_GCMD_TE)
998 iommu_disable_translation(iommu);
1001 set_irq_data(iommu->irq, NULL);
1002 /* This will mask the irq */
1003 free_irq(iommu->irq, iommu);
1004 destroy_irq(iommu->irq);
1007 kfree(iommu->domains);
1008 kfree(iommu->domain_ids);
1010 /* free context mapping */
1011 free_context_table(iommu);
1014 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1017 unsigned long ndomains;
1018 struct dmar_domain *domain;
1019 unsigned long flags;
1021 domain = alloc_domain_mem();
1025 ndomains = cap_ndoms(iommu->cap);
1027 spin_lock_irqsave(&iommu->lock, flags);
1028 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1029 if (num >= ndomains) {
1030 spin_unlock_irqrestore(&iommu->lock, flags);
1031 free_domain_mem(domain);
1032 printk(KERN_ERR "IOMMU: no free domain ids\n");
1036 set_bit(num, iommu->domain_ids);
1038 domain->iommu = iommu;
1039 iommu->domains[num] = domain;
1040 spin_unlock_irqrestore(&iommu->lock, flags);
1045 static void iommu_free_domain(struct dmar_domain *domain)
1047 unsigned long flags;
1049 spin_lock_irqsave(&domain->iommu->lock, flags);
1050 clear_bit(domain->id, domain->iommu->domain_ids);
1051 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1054 static struct iova_domain reserved_iova_list;
1055 static struct lock_class_key reserved_alloc_key;
1056 static struct lock_class_key reserved_rbtree_key;
1058 static void dmar_init_reserved_ranges(void)
1060 struct pci_dev *pdev = NULL;
1065 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1067 lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1068 &reserved_alloc_key);
1069 lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1070 &reserved_rbtree_key);
1072 /* IOAPIC ranges shouldn't be accessed by DMA */
1073 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1074 IOVA_PFN(IOAPIC_RANGE_END));
1076 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1078 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1079 for_each_pci_dev(pdev) {
1082 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1083 r = &pdev->resource[i];
1084 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1087 addr &= PAGE_MASK_4K;
1088 size = r->end - addr;
1089 size = PAGE_ALIGN_4K(size);
1090 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1091 IOVA_PFN(size + addr) - 1);
1093 printk(KERN_ERR "Reserve iova failed\n");
1099 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1101 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1104 static inline int guestwidth_to_adjustwidth(int gaw)
1107 int r = (gaw - 12) % 9;
1118 static int domain_init(struct dmar_domain *domain, int guest_width)
1120 struct intel_iommu *iommu;
1121 int adjust_width, agaw;
1122 unsigned long sagaw;
1124 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1125 spin_lock_init(&domain->mapping_lock);
1127 domain_reserve_special_ranges(domain);
1129 /* calculate AGAW */
1130 iommu = domain->iommu;
1131 if (guest_width > cap_mgaw(iommu->cap))
1132 guest_width = cap_mgaw(iommu->cap);
1133 domain->gaw = guest_width;
1134 adjust_width = guestwidth_to_adjustwidth(guest_width);
1135 agaw = width_to_agaw(adjust_width);
1136 sagaw = cap_sagaw(iommu->cap);
1137 if (!test_bit(agaw, &sagaw)) {
1138 /* hardware doesn't support it, choose a bigger one */
1139 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1140 agaw = find_next_bit(&sagaw, 5, agaw);
1144 domain->agaw = agaw;
1145 INIT_LIST_HEAD(&domain->devices);
1147 /* always allocate the top pgd */
1148 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1151 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1155 static void domain_exit(struct dmar_domain *domain)
1159 /* Domain 0 is reserved, so don't process it */
1163 domain_remove_dev_info(domain);
1165 put_iova_domain(&domain->iovad);
1166 end = DOMAIN_MAX_ADDR(domain->gaw);
1167 end = end & (~PAGE_MASK_4K);
1170 dma_pte_clear_range(domain, 0, end);
1172 /* free page tables */
1173 dma_pte_free_pagetable(domain, 0, end);
1175 iommu_free_domain(domain);
1176 free_domain_mem(domain);
1179 static int domain_context_mapping_one(struct dmar_domain *domain,
1182 struct context_entry *context;
1183 struct intel_iommu *iommu = domain->iommu;
1184 unsigned long flags;
1186 pr_debug("Set context mapping for %02x:%02x.%d\n",
1187 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1188 BUG_ON(!domain->pgd);
1189 context = device_to_context_entry(iommu, bus, devfn);
1192 spin_lock_irqsave(&iommu->lock, flags);
1193 if (context_present(*context)) {
1194 spin_unlock_irqrestore(&iommu->lock, flags);
1198 context_set_domain_id(*context, domain->id);
1199 context_set_address_width(*context, domain->agaw);
1200 context_set_address_root(*context, virt_to_phys(domain->pgd));
1201 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1202 context_set_fault_enable(*context);
1203 context_set_present(*context);
1204 __iommu_flush_cache(iommu, context, sizeof(*context));
1206 /* it's a non-present to present mapping */
1207 if (iommu_flush_context_device(iommu, domain->id,
1208 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1209 iommu_flush_write_buffer(iommu);
1211 iommu_flush_iotlb_dsi(iommu, 0, 0);
1212 spin_unlock_irqrestore(&iommu->lock, flags);
1217 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1220 struct pci_dev *tmp, *parent;
1222 ret = domain_context_mapping_one(domain, pdev->bus->number,
1227 /* dependent device mapping */
1228 tmp = pci_find_upstream_pcie_bridge(pdev);
1231 /* Secondary interface's bus number and devfn 0 */
1232 parent = pdev->bus->self;
1233 while (parent != tmp) {
1234 ret = domain_context_mapping_one(domain, parent->bus->number,
1238 parent = parent->bus->self;
1240 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1241 return domain_context_mapping_one(domain,
1242 tmp->subordinate->number, 0);
1243 else /* this is a legacy PCI bridge */
1244 return domain_context_mapping_one(domain,
1245 tmp->bus->number, tmp->devfn);
1248 static int domain_context_mapped(struct dmar_domain *domain,
1249 struct pci_dev *pdev)
1252 struct pci_dev *tmp, *parent;
1254 ret = device_context_mapped(domain->iommu,
1255 pdev->bus->number, pdev->devfn);
1258 /* dependent device mapping */
1259 tmp = pci_find_upstream_pcie_bridge(pdev);
1262 /* Secondary interface's bus number and devfn 0 */
1263 parent = pdev->bus->self;
1264 while (parent != tmp) {
1265 ret = device_context_mapped(domain->iommu, parent->bus->number,
1269 parent = parent->bus->self;
1272 return device_context_mapped(domain->iommu,
1273 tmp->subordinate->number, 0);
1275 return device_context_mapped(domain->iommu,
1276 tmp->bus->number, tmp->devfn);
1280 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1281 u64 hpa, size_t size, int prot)
1283 u64 start_pfn, end_pfn;
1284 struct dma_pte *pte;
1287 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1289 iova &= PAGE_MASK_4K;
1290 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1291 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1293 while (start_pfn < end_pfn) {
1294 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1297 /* We don't need lock here, nobody else
1298 * touches the iova range
1300 BUG_ON(dma_pte_addr(*pte));
1301 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1302 dma_set_pte_prot(*pte, prot);
1303 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1310 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1312 clear_context_table(domain->iommu, bus, devfn);
1313 iommu_flush_context_global(domain->iommu, 0);
1314 iommu_flush_iotlb_global(domain->iommu, 0);
/*
 * domain_remove_dev_info - detach and free every device entry of a domain
 * @domain: domain whose ->devices list is emptied
 *
 * Entries are taken from the head of domain->devices until the list is
 * empty.  Each one is unlinked from both the per-domain list (->link) and
 * the global list (->global) and the archdata.iommu back-pointer is reset
 * while device_domain_lock is held.  The lock is released around
 * detach_domain_for_dev() and free_devinfo_mem() and re-taken before the
 * list is examined again.
 */
1317 static void domain_remove_dev_info(struct dmar_domain *domain)
1319 struct device_domain_info *info;
1320 unsigned long flags;
1322 spin_lock_irqsave(&device_domain_lock, flags);
1323 while (!list_empty(&domain->devices)) {
1324 info = list_entry(domain->devices.next,
1325 struct device_domain_info, link);
1326 list_del(&info->link);
1327 list_del(&info->global);
1329 info->dev->dev.archdata.iommu = NULL;
/* the entry is already unlinked, so the flush and free run unlocked */
1330 spin_unlock_irqrestore(&device_domain_lock, flags);
1332 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1333 free_devinfo_mem(info);
1335 spin_lock_irqsave(&device_domain_lock, flags);
1337 spin_unlock_irqrestore(&device_domain_lock, flags);
1342 * Note: struct pci_dev->dev.archdata.iommu is used to store the info
/*
 * find_domain - look up the domain already attached to a PCI device
 * @pdev: device to query
 *
 * Reads the device_domain_info pointer kept in pdev->dev.archdata.iommu
 * and returns its ->domain.  device_domain_lock is not taken here.
 * NOTE(review): callers compare the result against NULL, so presumably
 * NULL is returned when no info is attached - confirm.
 */
1344 struct dmar_domain *
1345 find_domain(struct pci_dev *pdev)
1347 struct device_domain_info *info;
1349 /* No lock here, assumes no domain exit in normal case */
1350 info = pdev->dev.archdata.iommu;
1352 return info->domain;
/*
 * get_domain_for_dev - find or create the domain used by a PCI device
 * @pdev: device that needs a domain
 * @gaw:  guest address width handed to domain_init() for a new domain
 *
 * Visible steps:
 *  1. Ask find_domain() for a domain already attached to @pdev.
 *  2. Look for an upstream PCIe bridge.  The lookup key is the subordinate
 *     bus number for a PCIe bridge, otherwise the bus/devfn of the bridge
 *     itself.  device_domain_list is searched under device_domain_lock for
 *     an entry with that key so its domain can be shared.
 *  3. Otherwise locate the DRHD unit for @pdev, allocate a domain on its
 *     IOMMU and initialise it; domain_exit() undoes a failed domain_init().
 *  4. Register an entry for the bridge, flag the domain with
 *     DOMAIN_FLAG_MULTIPLE_DEVICES and re-check the global list under the
 *     lock; if another entry with the same key appeared meanwhile, the new
 *     info and domain are released.
 *  5. Register an entry for @pdev itself.  find_domain() is repeated under
 *     the lock; if it now succeeds the new info is freed (and the new
 *     domain too when it differs), otherwise the entry is linked into both
 *     lists and stored in pdev->dev.archdata.iommu.
 *
 * Returns whatever find_domain(pdev) reports at the end.
 */
1356 /* domain is initialized */
1357 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1359 struct dmar_domain *domain, *found = NULL;
1360 struct intel_iommu *iommu;
1361 struct dmar_drhd_unit *drhd;
1362 struct device_domain_info *info, *tmp;
1363 struct pci_dev *dev_tmp;
1364 unsigned long flags;
1365 int bus = 0, devfn = 0;
1367 domain = find_domain(pdev);
1371 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
/* choose the bus/devfn pair under which the shared domain is registered */
1373 if (dev_tmp->is_pcie) {
1374 bus = dev_tmp->subordinate->number;
1377 bus = dev_tmp->bus->number;
1378 devfn = dev_tmp->devfn;
1380 spin_lock_irqsave(&device_domain_lock, flags);
1381 list_for_each_entry(info, &device_domain_list, global) {
1382 if (info->bus == bus && info->devfn == devfn) {
1383 found = info->domain;
1387 spin_unlock_irqrestore(&device_domain_lock, flags);
1388 /* pcie-pci bridge already has a domain, uses it */
1395 /* Allocate new domain for the device */
1396 drhd = dmar_find_matched_drhd_unit(pdev);
1398 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1402 iommu = drhd->iommu;
1404 domain = iommu_alloc_domain(iommu);
1408 if (domain_init(domain, gaw)) {
1409 domain_exit(domain);
1413 /* register pcie-to-pci device */
1415 info = alloc_devinfo_mem();
1417 domain_exit(domain);
1421 info->devfn = devfn;
1423 info->domain = domain;
1424 /* This domain is shared by devices under p2p bridge */
1425 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1427 /* pcie-to-pci bridge already has a domain, uses it */
1429 spin_lock_irqsave(&device_domain_lock, flags);
1430 list_for_each_entry(tmp, &device_domain_list, global) {
1431 if (tmp->bus == bus && tmp->devfn == devfn) {
1432 found = tmp->domain;
/* lost the race: discard the entry and domain prepared above */
1437 free_devinfo_mem(info);
1438 domain_exit(domain);
1441 list_add(&info->link, &domain->devices);
1442 list_add(&info->global, &device_domain_list);
1444 spin_unlock_irqrestore(&device_domain_lock, flags);
/* second entry: the requesting device itself */
1448 info = alloc_devinfo_mem();
1451 info->bus = pdev->bus->number;
1452 info->devfn = pdev->devfn;
1454 info->domain = domain;
1455 spin_lock_irqsave(&device_domain_lock, flags);
1456 /* somebody is fast */
1457 found = find_domain(pdev);
1458 if (found != NULL) {
1459 spin_unlock_irqrestore(&device_domain_lock, flags);
1460 if (found != domain) {
1461 domain_exit(domain);
1464 free_devinfo_mem(info);
1467 list_add(&info->link, &domain->devices);
1468 list_add(&info->global, &device_domain_list);
1469 pdev->dev.archdata.iommu = info;
1470 spin_unlock_irqrestore(&device_domain_lock, flags);
1473 /* recheck it here, maybe others set it */
1474 return find_domain(pdev);
/*
 * iommu_prepare_identity_map - map a physical range 1:1 for one device
 * @pdev:  device that needs the identity mapping
 * @start: first byte of the range; rounded down to a 4K boundary
 * @end:   end of the range
 *         NOTE(review): the RMRR caller passes end_address + 1, so @end
 *         looks exclusive - confirm against the size computation.
 *
 * Obtains (or creates) the domain for @pdev, reserves the matching IOVA
 * page frames so the allocator cannot hand them out, clears any PTEs
 * already present in the range, installs read/write PTEs with IOVA equal
 * to the physical address, and finally sets up the context entry via
 * domain_context_mapping().  domain_exit() appears on the failure path.
 */
1477 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1479 struct dmar_domain *domain;
1485 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1486 pci_name(pdev), start, end);
1487 /* page table init */
1488 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1492 /* The address might not be aligned */
1493 base = start & PAGE_MASK_4K;
1495 size = PAGE_ALIGN_4K(size);
/* reserve_iova() takes an inclusive last pfn, hence the - 1 */
1496 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1497 IOVA_PFN(base + size) - 1)) {
1498 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1503 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1504 size, base, pci_name(pdev));
1506 * RMRR range might have overlap with physical memory range,
1509 dma_pte_clear_range(domain, base, base + size);
/* identity mapping: the IOVA and the host physical address are both base */
1511 ret = domain_page_mapping(domain, base, base, size,
1512 DMA_PTE_READ|DMA_PTE_WRITE);
1516 /* context entry init */
1517 ret = domain_context_mapping(domain, pdev);
1521 domain_exit(domain);
/*
 * iommu_prepare_rmrr_dev - identity-map one RMRR region for one device
 * @rmrr: reserved memory region descriptor
 * @pdev: device listed under @rmrr
 *
 * Devices tagged with DUMMY_DEVICE_DOMAIN_INFO are special-cased by the
 * first test.  Otherwise the range from rmrr->base_address up to
 * rmrr->end_address + 1 is passed to iommu_prepare_identity_map() and its
 * result is returned.
 */
1526 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1527 struct pci_dev *pdev)
1529 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1531 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1532 rmrr->end_address + 1);
1535 #ifdef CONFIG_DMAR_GFX_WA
/*
 * Argument bundle passed to iommu_prepare_work_fn() through its untyped
 * data pointer by iommu_prepare_with_active_regions().
 */
1536 struct iommu_prepare_data {
/* device whose identity mappings are being built */
1537 struct pci_dev *pdev;
/*
 * iommu_prepare_work_fn - identity-map one active memory region
 * @start_pfn: first page frame of the region
 * @end_pfn:   end page frame of the region
 * @datax:     really a struct iommu_prepare_data pointer
 *
 * Callback handed to work_with_active_regions().  Converts the frame
 * numbers to byte addresses with PAGE_SHIFT, maps the range 1:1 for
 * data->pdev and stores the outcome in data->ret.
 */
1541 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1542 unsigned long end_pfn, void *datax)
1544 struct iommu_prepare_data *data;
1546 data = (struct iommu_prepare_data *)datax;
1548 data->ret = iommu_prepare_identity_map(data->pdev,
1549 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
/*
 * iommu_prepare_with_active_regions - identity-map active memory for a device
 * @pdev: device to map for
 *
 * Runs iommu_prepare_work_fn() over the active regions of every online
 * NUMA node, passing a struct iommu_prepare_data as context.
 */
1554 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1557 struct iommu_prepare_data data;
1562 for_each_online_node(nid) {
1563 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
/*
 * iommu_prepare_gfx_mapping - give graphics devices a 1:1 mapping
 *
 * Walks all PCI devices.  Devices tagged DUMMY_DEVICE_DOMAIN_INFO and
 * devices that are not display-class (IS_GFX_DEVICE) are filtered out by
 * the first test; for the rest iommu_prepare_with_active_regions() is
 * called and a failure is logged.
 */
1570 static void __init iommu_prepare_gfx_mapping(void)
1572 struct pci_dev *pdev = NULL;
1575 for_each_pci_dev(pdev) {
1576 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1577 !IS_GFX_DEVICE(pdev))
1579 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1581 ret = iommu_prepare_with_active_regions(pdev);
1583 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
/*
 * iommu_prepare_isa - floppy workaround: identity-map low memory for the
 * ISA/LPC bridge
 *
 * With CONFIG_DMAR_FLOPPY_WA the first ISA bridge found by pci_get_class()
 * gets a 1:1 mapping of the range 0 .. 16*1024*1024.  Without the option
 * the function is a stub.
 *
 * NOTE(review): the failure message below says 0-64M although the range
 * mapped is 0-16M, and that printk carries no KERN_ log level.
 * NOTE(review): no pci_dev_put() for the device returned by
 * pci_get_class() is visible here - confirm whether the reference is
 * meant to be kept.
 */
1588 #ifdef CONFIG_DMAR_FLOPPY_WA
1589 static inline void iommu_prepare_isa(void)
1591 struct pci_dev *pdev;
1594 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1598 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1599 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1602 printk("IOMMU: Failed to create 0-64M identity map, "
1603 "floppy might not work\n");
1607 static inline void iommu_prepare_isa(void)
1611 #endif /* !CONFIG_DMAR_FLOPPY_WA */
/*
 * init_dmars - bring up every DMA remapping hardware unit
 *
 * Visible phases:
 *  1. Walk the DRHD units once (the comment says a counter is incremented
 *     in this single-threaded init path) and allocate the deferred_flush
 *     array with one struct deferred_flush_tables per IOMMU
 *     (g_num_of_iommus entries, zeroed).
 *  2. For each unit, initialise its domain bookkeeping with
 *     iommu_init_domains() and allocate its root entry table.
 *  3. For every device listed under every RMRR, set up the reserved-region
 *     identity mapping; then apply the graphics and ISA workarounds.
 *  4. For each unit: name it dmarN, flush the write buffer, set up the
 *     fault interrupt, program the root entry, issue global context and
 *     IOTLB flushes, disable the protected memory regions and enable
 *     translation.
 *  5. A final loop over the units follows; it looks like the error
 *     unwind, but its body is not shown here - confirm.
 *
 * Returns int; the specific values are set on lines not shown here.
 */
1613 int __init init_dmars(void)
1615 struct dmar_drhd_unit *drhd;
1616 struct dmar_rmrr_unit *rmrr;
1617 struct pci_dev *pdev;
1618 struct intel_iommu *iommu;
1619 int i, ret, unit = 0;
1624 * initialize and program root entry to not present
1627 for_each_drhd_unit(drhd) {
1630 * lock not needed as this is only incremented in the single
1631 * threaded kernel __init code path all other access are read
1636 deferred_flush = kzalloc(g_num_of_iommus *
1637 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1638 if (!deferred_flush) {
1643 for_each_drhd_unit(drhd) {
1647 iommu = drhd->iommu;
1649 ret = iommu_init_domains(iommu);
1655 * we could share the same root & context tables
1656 * amoung all IOMMU's. Need to Split it later.
1658 ret = iommu_alloc_root_entry(iommu);
1660 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1667 * for each dev attached to rmrr
1669 * locate drhd for dev, alloc domain for dev
1670 * allocate free domain
1671 * allocate page table entries for rmrr
1672 * if context not allocated for bus
1673 * allocate and init context
1674 * set present in root table for this bus
1675 * init context with domain, translation etc
1679 for_each_rmrr_units(rmrr) {
1680 for (i = 0; i < rmrr->devices_cnt; i++) {
1681 pdev = rmrr->devices[i];
1682 /* some BIOS lists non-exist devices in DMAR table */
1685 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1688 "IOMMU: mapping reserved region failed\n");
1692 iommu_prepare_gfx_mapping();
1694 iommu_prepare_isa();
1699 * global invalidate context cache
1700 * global invalidate iotlb
1701 * enable translation
1703 for_each_drhd_unit(drhd) {
1706 iommu = drhd->iommu;
1707 sprintf (iommu->name, "dmar%d", unit++);
1709 iommu_flush_write_buffer(iommu);
1711 ret = dmar_set_interrupt(iommu);
1715 iommu_set_root_entry(iommu);
1717 iommu_flush_context_global(iommu, 0);
1718 iommu_flush_iotlb_global(iommu, 0);
1720 iommu_disable_protect_mem_regions(iommu);
1722 ret = iommu_enable_translation(iommu);
1729 for_each_drhd_unit(drhd) {
1732 iommu = drhd->iommu;
/*
 * aligned_size - bytes of whole 4K pages needed to cover a buffer
 * @host_addr: address of the buffer; only the bits cleared by
 *             PAGE_MASK_4K (the offset inside the page) are used
 * @size:      length of the buffer in bytes
 *
 * Adds the in-page offset to @size and rounds the sum up with
 * PAGE_ALIGN_4K(), so a buffer that straddles a page boundary is counted
 * as two pages even when @size is small.
 */
1738 static inline u64 aligned_size(u64 host_addr, size_t size)
1741 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1742 return PAGE_ALIGN_4K(addr);
/*
 * iommu_alloc_iova - allocate an IOVA range below a given limit
 * @domain: domain whose IOVA allocator (domain->iovad) is used
 * @size:   length in bytes; converted to 4K pages for alloc_iova()
 * @end:    highest address the caller can accept (a DMA mask)
 *
 * @end is first clamped to the largest address the domain can express,
 * DOMAIN_MAX_ADDR(domain->gaw).  A zero @size, or a @size that cannot fit
 * between IOVA_START_ADDR and @end, is rejected by the check below.
 * NOTE(review): the meaning of the trailing 1 passed to alloc_iova() is
 * defined by that helper - confirm there.
 */
1746 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1750 /* Make sure it's in range */
1751 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1752 if (!size || (IOVA_START_ADDR + size > end))
1755 piova = alloc_iova(&domain->iovad,
1756 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
/*
 * __intel_alloc_iova - pick an IOVA range that suits the device DMA mask
 * @dev:    generic device; converted to its pci_dev
 * @domain: domain to allocate from
 *
 * When the device mask is no wider than 32 bits, or dmar_forcedac is set,
 * a single allocation bounded by pdev->dma_mask is made.  Otherwise the
 * range below DMA_32BIT_MASK is tried first and the full device mask is
 * used as the second attempt.  An error is logged when no range is found.
 *
 * NOTE(review): the error format string has no trailing newline.
 */
1760 static struct iova *
1761 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1764 struct pci_dev *pdev = to_pci_dev(dev);
1765 struct iova *iova = NULL;
1767 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1768 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1771 * First try to allocate an io virtual address in
1772 * DMA_32BIT_MASK and if that fails then try allocating
1775 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1777 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1781 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
/*
 * get_valid_domain_for_dev - domain for a device, with context entry ready
 * @pdev: device about to perform DMA
 *
 * Fetches (or creates) the domain with the default address width and, if
 * domain_context_mapped() reports that the context entry for @pdev is not
 * yet in place, installs it with domain_context_mapping().  Both failure
 * cases are logged.
 *
 * NOTE(review): neither visible error format string ends in a newline.
 */
1788 static struct dmar_domain *
1789 get_valid_domain_for_dev(struct pci_dev *pdev)
1791 struct dmar_domain *domain;
1794 domain = get_domain_for_dev(pdev,
1795 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1798 "Allocating domain for %s failed", pci_name(pdev));
1802 /* make sure context mapping is ok */
1803 if (unlikely(!domain_context_mapped(domain, pdev))) {
1804 ret = domain_context_mapping(domain, pdev);
1807 "Domain context map for %s failed",
/*
 * intel_map_single - map one physically contiguous buffer for DMA
 * @hwdev: device that will access the buffer
 * @paddr: physical address of the buffer
 * @size:  length in bytes
 * @dir:   DMA direction; DMA_NONE is a BUG
 *
 * Devices tagged DUMMY_DEVICE_DOMAIN_INFO are special-cased first.  For
 * all others the size is widened to whole 4K pages, an IOVA range of that
 * size is allocated, page-table entries pointing at the page-aligned
 * physical address are installed, and a page-selective IOTLB flush is
 * issued for the new range.
 *
 * Protection bits: READ is granted for to-device and bidirectional
 * transfers, and also whenever the IOMMU capability register lacks ZLR
 * (zero-length read) support; WRITE is granted for from-device and
 * bidirectional transfers.
 *
 * Returns the IOVA of the first mapped page plus the offset of @paddr
 * inside its page.  The failure path releases the IOVA range and logs the
 * request.
 */
1817 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1819 struct pci_dev *pdev = to_pci_dev(hwdev);
1820 struct dmar_domain *domain;
1821 unsigned long start_paddr;
1826 BUG_ON(dir == DMA_NONE);
1827 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1830 domain = get_valid_domain_for_dev(pdev);
/* from here on size is a multiple of 4K that covers the whole buffer */
1834 size = aligned_size((u64)paddr, size);
1836 iova = __intel_alloc_iova(hwdev, domain, size);
/* despite its name, start_paddr holds the IOVA of the first mapped page */
1840 start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1843 * Check if DMAR supports zero-length reads on write only
1846 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1847 !cap_zlr(domain->iommu->cap))
1848 prot |= DMA_PTE_READ;
1849 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1850 prot |= DMA_PTE_WRITE;
1852 * paddr - (paddr + size) might be partial page, we should map the whole
1853 * page. Note: if two part of one page are separately mapped, we
1854 * might have two guest_addr mapping to the same host paddr, but this
1855 * is not a big problem
1857 ret = domain_page_mapping(domain, start_paddr,
1858 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1862 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1863 pci_name(pdev), size, (u64)paddr,
1864 size, (u64)start_paddr, dir);
1866 /* it's a non-present to present mapping */
1867 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1868 start_paddr, size >> PAGE_SHIFT_4K, 1);
1870 iommu_flush_write_buffer(domain->iommu);
/* hand back the IOVA with the original in-page offset restored */
1872 return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1876 __free_iova(&domain->iovad, iova);
1877 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1878 pci_name(pdev), size, (u64)paddr, dir);
/*
 * flush_unmaps - retire every deferred unmap in one go
 *
 * For each IOMMU slot in deferred_flush[] that has queued entries, one
 * global IOTLB flush is issued (the IOMMU is taken from the first queued
 * domain), then every queued IOVA is returned to the allocator of its
 * domain and the slot is reset to empty.
 * NOTE(review): both callers in view hold async_umap_flush_lock around
 * their bodies - confirm that this function is only run under that lock.
 */
1882 static void flush_unmaps(void)
1888 /* just flush them all */
1889 for (i = 0; i < g_num_of_iommus; i++) {
1890 if (deferred_flush[i].next) {
1891 struct intel_iommu *iommu =
1892 deferred_flush[i].domain[0]->iommu;
1894 iommu_flush_iotlb_global(iommu, 0);
1895 for (j = 0; j < deferred_flush[i].next; j++) {
1896 __free_iova(&deferred_flush[i].domain[j]->iovad,
1897 deferred_flush[i].iova[j]);
1899 deferred_flush[i].next = 0;
/*
 * flush_unmaps_timeout - handler of unmap_timer
 * @data: timer cookie; not referenced on the visible lines
 *
 * Takes async_umap_flush_lock with interrupts disabled around its body.
 * NOTE(review): presumably the body runs flush_unmaps() - confirm.
 */
1906 static void flush_unmaps_timeout(unsigned long data)
1908 unsigned long flags;
1910 spin_lock_irqsave(&async_umap_flush_lock, flags);
1912 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * add_unmap - queue an IOVA range for a later batched IOTLB flush
 * @dom:  domain the range belongs to
 * @iova: range to release once the flush has happened
 *
 * Under async_umap_flush_lock: the queue reaching HIGH_WATER_MARK entries
 * is treated specially, then the (domain, iova) pair is stored in the
 * next free slot of the deferred_flush[] table selected by the seq_id of
 * the IOMMU, and unmap_timer is set to fire 10 ms from now.
 */
1915 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1917 unsigned long flags;
1920 spin_lock_irqsave(&async_umap_flush_lock, flags);
1921 if (list_size == HIGH_WATER_MARK)
/* one table per IOMMU, indexed by its sequence id */
1924 iommu_id = dom->iommu->seq_id;
1926 next = deferred_flush[iommu_id].next;
1927 deferred_flush[iommu_id].domain[next] = dom;
1928 deferred_flush[iommu_id].iova[next] = iova;
1929 deferred_flush[iommu_id].next++;
1932 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1936 spin_unlock_irqrestore(&async_umap_flush_lock, flags);
/*
 * intel_unmap_single - tear down a mapping made by intel_map_single()
 * @dev:      device the mapping belongs to
 * @dev_addr: DMA address returned at map time
 * @size:     length of the original buffer in bytes
 * @dir:      DMA direction; not referenced on the visible lines
 *
 * Devices tagged DUMMY_DEVICE_DOMAIN_INFO are special-cased first.
 * Otherwise the IOVA range containing @dev_addr is looked up, its PTEs
 * are cleared and the page-table pages that covered it are freed.
 *
 * With intel_iommu_strict set, a page-selective IOTLB flush is issued
 * (followed by a write-buffer flush when it returns non-zero) and the
 * IOVA range is freed immediately.  Otherwise the range is queued with
 * add_unmap() so that the flush can be batched.
 */
1939 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1940 size_t size, int dir)
1942 struct pci_dev *pdev = to_pci_dev(dev);
1943 struct dmar_domain *domain;
1944 unsigned long start_addr;
1947 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1949 domain = find_domain(pdev);
1952 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1956 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
/* same page rounding as at map time, so the identical span is cleared */
1957 size = aligned_size((u64)dev_addr, size);
1959 pr_debug("Device %s unmapping: %lx@%llx\n",
1960 pci_name(pdev), size, (u64)start_addr);
1962 /* clear the whole page */
1963 dma_pte_clear_range(domain, start_addr, start_addr + size);
1964 /* free page tables */
1965 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1966 if (intel_iommu_strict) {
1967 if (iommu_flush_iotlb_psi(domain->iommu,
1968 domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1969 iommu_flush_write_buffer(domain->iommu);
1971 __free_iova(&domain->iovad, iova);
1973 add_unmap(domain, iova);
1975 * queue up the release of the unmap to save the 1/6th of the
1976 * cpu used up by the iotlb flush operation...
/*
 * intel_alloc_coherent - allocate zeroed pages and map them for DMA
 * @hwdev:      device that will use the buffer
 * @size:       requested length; rounded up to a 4K multiple
 * @dma_handle: receives the DMA address of the buffer
 * @flags:      allocation flags; GFP_DMA and GFP_DMA32 are stripped, so
 *              the pages are not forced into the low zones
 *
 * The pages come from __get_free_pages(), are cleared with memset() and
 * are mapped bidirectionally through intel_map_single().  free_pages()
 * on the last visible line releases them again on the failure path.
 */
1981 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1982 dma_addr_t *dma_handle, gfp_t flags)
1987 size = PAGE_ALIGN_4K(size);
1988 order = get_order(size);
1989 flags &= ~(GFP_DMA | GFP_DMA32);
1991 vaddr = (void *)__get_free_pages(flags, order);
1994 memset(vaddr, 0, size);
1996 *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
1999 free_pages((unsigned long)vaddr, order);
/*
 * intel_free_coherent - undo intel_alloc_coherent()
 * @hwdev:      device the buffer was mapped for
 * @size:       length given at allocation time; rounded up to 4K again
 * @vaddr:      kernel virtual address of the buffer
 * @dma_handle: DMA address returned at allocation time
 *
 * Removes the bidirectional mapping first and then returns the pages,
 * using the same order computation as the allocation side.
 */
2003 static void intel_free_coherent(struct device *hwdev, size_t size,
2004 void *vaddr, dma_addr_t dma_handle)
2008 size = PAGE_ALIGN_4K(size);
2009 order = get_order(size);
2011 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2012 free_pages((unsigned long)vaddr, order);
/* Kernel virtual address of the data described by a scatterlist entry. */
2015 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
/*
 * intel_unmap_sg - tear down a mapping made by intel_map_sg()
 * @hwdev:  device the mapping belongs to
 * @sglist: scatterlist that was mapped
 * @nelems: number of entries in @sglist
 * @dir:    DMA direction; not referenced on the visible lines
 *
 * Devices tagged DUMMY_DEVICE_DOMAIN_INFO are special-cased first.  The
 * whole list shares one IOVA range, found through the DMA address of the
 * first entry.  The total length is rebuilt by summing the page-rounded
 * size of every entry, then the PTEs are cleared, the page-table pages
 * are freed, a page-selective IOTLB flush is issued and the IOVA range is
 * released.  No call to add_unmap() is visible here, unlike in
 * intel_unmap_single().
 */
2016 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2017 int nelems, int dir)
2020 struct pci_dev *pdev = to_pci_dev(hwdev);
2021 struct dmar_domain *domain;
2022 unsigned long start_addr;
2026 struct scatterlist *sg;
2028 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2031 domain = find_domain(pdev);
2033 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
/* only the in-page offset of addr matters to aligned_size() */
2036 for_each_sg(sglist, sg, nelems, i) {
2037 addr = SG_ENT_VIRT_ADDRESS(sg);
2038 size += aligned_size((u64)addr, sg->length);
2041 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2043 /* clear the whole page */
2044 dma_pte_clear_range(domain, start_addr, start_addr + size);
2045 /* free page tables */
2046 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2048 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2049 size >> PAGE_SHIFT_4K, 0))
2050 iommu_flush_write_buffer(domain->iommu);
2053 __free_iova(&domain->iovad, iova);
/*
 * intel_nontranslate_map_sg - map a scatterlist without IOMMU translation
 * @hddev:  device (spelled hddev here, hwdev in the sibling functions)
 * @sglist: scatterlist to fill in
 * @nelems: number of entries in @sglist
 * @dir:    DMA direction; not referenced on the visible lines
 *
 * Used by intel_map_sg() for devices tagged DUMMY_DEVICE_DOMAIN_INFO.
 * Each entry gets the bus address of its data as dma_address and its own
 * length as dma_length.  An entry without a page is a BUG.
 */
2056 static int intel_nontranslate_map_sg(struct device *hddev,
2057 struct scatterlist *sglist, int nelems, int dir)
2060 struct scatterlist *sg;
2062 for_each_sg(sglist, sg, nelems, i) {
2063 BUG_ON(!sg_page(sg));
2064 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2065 sg->dma_length = sg->length;
/*
 * intel_map_sg - map a scatterlist into one contiguous IOVA range
 * @hwdev:  device that will perform the DMA
 * @sglist: scatterlist to map
 * @nelems: number of entries in @sglist
 * @dir:    DMA direction; DMA_NONE is a BUG
 *
 * Devices tagged DUMMY_DEVICE_DOMAIN_INFO are handed to
 * intel_nontranslate_map_sg().  Otherwise:
 *  - a first pass sums the page-rounded size of every entry,
 *  - one IOVA range of that total size is allocated (on failure the
 *    dma_length of the first entry is set to 0),
 *  - the protection bits are derived from @dir and the ZLR capability in
 *    the same way as in intel_map_single(),
 *  - a second pass maps each entry at start_addr + offset and records its
 *    dma_address (including the in-page offset) and dma_length,
 *  - a page-selective IOTLB flush covers the pages mapped so far.
 *
 * If mapping an entry fails, the PTEs and page-table pages of the part
 * already mapped (start_addr .. start_addr + offset) are released and the
 * IOVA range is freed.
 */
2070 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2071 int nelems, int dir)
2075 struct pci_dev *pdev = to_pci_dev(hwdev);
2076 struct dmar_domain *domain;
2080 struct iova *iova = NULL;
2082 struct scatterlist *sg;
2083 unsigned long start_addr;
2085 BUG_ON(dir == DMA_NONE);
2086 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2087 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2089 domain = get_valid_domain_for_dev(pdev);
/* pass 1: total number of bytes, in whole pages, needed for the list */
2093 for_each_sg(sglist, sg, nelems, i) {
2094 addr = SG_ENT_VIRT_ADDRESS(sg);
2095 addr = (void *)virt_to_phys(addr);
2096 size += aligned_size((u64)addr, sg->length);
2099 iova = __intel_alloc_iova(hwdev, domain, size);
2101 sglist->dma_length = 0;
2106 * Check if DMAR supports zero-length reads on write only
2109 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2110 !cap_zlr(domain->iommu->cap))
2111 prot |= DMA_PTE_READ;
2112 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2113 prot |= DMA_PTE_WRITE;
2115 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
/* pass 2: map the entries back to back inside the allocated range */
2117 for_each_sg(sglist, sg, nelems, i) {
2118 addr = SG_ENT_VIRT_ADDRESS(sg);
2119 addr = (void *)virt_to_phys(addr);
2120 size = aligned_size((u64)addr, sg->length);
2121 ret = domain_page_mapping(domain, start_addr + offset,
2122 ((u64)addr) & PAGE_MASK_4K,
2125 /* clear the page */
2126 dma_pte_clear_range(domain, start_addr,
2127 start_addr + offset);
2128 /* free page tables */
2129 dma_pte_free_pagetable(domain, start_addr,
2130 start_addr + offset);
2132 __free_iova(&domain->iovad, iova);
2135 sg->dma_address = start_addr + offset +
2136 ((u64)addr & (~PAGE_MASK_4K));
2137 sg->dma_length = sg->length;
2141 /* it's a non-present to present mapping */
2142 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2143 start_addr, offset >> PAGE_SHIFT_4K, 1))
2144 iommu_flush_write_buffer(domain->iommu);
/*
 * DMA mapping operations implemented on top of the IOMMU.  The table is
 * installed as the global dma_ops by intel_iommu_init().
 */
2148 static struct dma_mapping_ops intel_dma_ops = {
2149 .alloc_coherent = intel_alloc_coherent,
2150 .free_coherent = intel_free_coherent,
2151 .map_single = intel_map_single,
2152 .unmap_single = intel_unmap_single,
2153 .map_sg = intel_map_sg,
2154 .unmap_sg = intel_unmap_sg,
/*
 * iommu_domain_cache_init - create the slab cache for struct dmar_domain
 *
 * Stores the cache in iommu_domain_cache and logs an error when
 * kmem_cache_create() returns NULL.
 */
2157 static inline int iommu_domain_cache_init(void)
2161 iommu_domain_cache = kmem_cache_create("iommu_domain",
2162 sizeof(struct dmar_domain),
2167 if (!iommu_domain_cache) {
2168 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/*
 * iommu_devinfo_cache_init - create the slab cache for
 * struct device_domain_info
 *
 * Stores the cache in iommu_devinfo_cache and logs an error when
 * kmem_cache_create() returns NULL.
 */
2175 static inline int iommu_devinfo_cache_init(void)
2179 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2180 sizeof(struct device_domain_info),
2185 if (!iommu_devinfo_cache) {
2186 printk(KERN_ERR "Couldn't create devinfo cache\n");
/*
 * iommu_iova_cache_init - create the slab cache for struct iova
 *
 * Stores the cache in iommu_iova_cache and logs an error when
 * kmem_cache_create() returns NULL.
 */
2193 static inline int iommu_iova_cache_init(void)
2197 iommu_iova_cache = kmem_cache_create("iommu_iova",
2198 sizeof(struct iova),
2203 if (!iommu_iova_cache) {
2204 printk(KERN_ERR "Couldn't create iova cache\n");
/*
 * iommu_init_mempool - create the three slab caches used by the driver
 *
 * The caches are created in the order iova, domain, devinfo.  The two
 * kmem_cache_destroy() calls at the end are the unwind path: they remove
 * the domain cache and the iova cache, in reverse order of creation.
 */
2211 static int __init iommu_init_mempool(void)
2214 ret = iommu_iova_cache_init();
2218 ret = iommu_domain_cache_init();
2222 ret = iommu_devinfo_cache_init();
2226 kmem_cache_destroy(iommu_domain_cache);
2228 kmem_cache_destroy(iommu_iova_cache);
/*
 * iommu_exit_mempool - destroy the slab caches made by iommu_init_mempool()
 *
 * The caches are destroyed in reverse order of creation.
 */
2233 static void __init iommu_exit_mempool(void)
2235 kmem_cache_destroy(iommu_devinfo_cache);
2236 kmem_cache_destroy(iommu_domain_cache);
2237 kmem_cache_destroy(iommu_iova_cache);
/*
 * init_no_remapping_devices - decide which DRHD units and devices bypass
 * DMA remapping
 *
 * Pass 1: for every unit that is not include_all, the device array is
 * scanned for a non-NULL entry; a unit without any existing PCI device is
 * to be ignored.
 *
 * Pass 2: units that are ignored or include_all are filtered out.  For
 * the others the device array is scanned for a device that is not
 * display-class; when one is found (i < devices_cnt) the unit is handled
 * differently from the gfx-only case.  In the gfx-only case every
 * existing device of the unit is tagged with DUMMY_DEVICE_DOMAIN_INFO,
 * the marker the DMA mapping functions test for.
 */
2241 static void __init init_no_remapping_devices(void)
2243 struct dmar_drhd_unit *drhd;
2245 for_each_drhd_unit(drhd) {
2246 if (!drhd->include_all) {
2248 for (i = 0; i < drhd->devices_cnt; i++)
2249 if (drhd->devices[i] != NULL)
2251 /* ignore DMAR unit if no pci devices exist */
2252 if (i == drhd->devices_cnt)
2260 for_each_drhd_unit(drhd) {
2262 if (drhd->ignored || drhd->include_all)
2265 for (i = 0; i < drhd->devices_cnt; i++)
2266 if (drhd->devices[i] &&
2267 !IS_GFX_DEVICE(drhd->devices[i]))
2270 if (i < drhd->devices_cnt)
2273 /* bypass IOMMU if it is just for gfx devices */
2275 for (i = 0; i < drhd->devices_cnt; i++) {
2276 if (!drhd->devices[i])
2278 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
/*
 * intel_iommu_init - entry point that sets up Intel DMA remapping
 *
 * Visible sequence: parse the DMAR table and the device scopes; check
 * no_iommu, swiotlb and dmar_disabled before going further; create the
 * slab caches; initialise the reserved IOVA ranges; mark the devices that
 * bypass remapping.  The failure branch logs an error, releases
 * reserved_iova_list and destroys the caches.  On success a banner is
 * printed, unmap_timer is initialised and intel_dma_ops becomes the
 * global dma_ops.
 *
 * NOTE(review): the return value of iommu_init_mempool() is ignored.
 * NOTE(review): unmap_timer is already set up by DEFINE_TIMER at the top
 * of the file, so init_timer() looks redundant - confirm.
 */
2283 int __init intel_iommu_init(void)
2287 if (dmar_table_init())
2290 if (dmar_dev_scope_init())
2294 * Check the need for DMA-remapping initialization now.
2295 * Above initialization will also be used by Interrupt-remapping.
2297 if (no_iommu || swiotlb || dmar_disabled)
2300 iommu_init_mempool();
2301 dmar_init_reserved_ranges();
2303 init_no_remapping_devices();
2307 printk(KERN_ERR "IOMMU: dmar init failed\n");
2308 put_iova_domain(&reserved_iova_list);
2309 iommu_exit_mempool();
2313 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2315 init_timer(&unmap_timer);
2317 dma_ops = &intel_dma_ops;