2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) 2006-2008 Intel Corporation
18 * Author: Ashok Raj <ashok.raj@intel.com>
19 * Author: Shaohua Li <shaohua.li@intel.com>
20 * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/slab.h>
26 #include <linux/irq.h>
27 #include <linux/interrupt.h>
28 #include <linux/sysdev.h>
29 #include <linux/spinlock.h>
30 #include <linux/pci.h>
31 #include <linux/dmar.h>
32 #include <linux/dma-mapping.h>
33 #include <linux/mempool.h>
35 #include "intel-iommu.h"
36 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
37 #include <asm/cacheflush.h>
/*
 * Device-class tests and address-layout constants used by this file.
 *
 * NOTE(review): IS_GFX_DEVICE/IS_ISA_DEVICE expand `pdev`, and
 * DOMAIN_MAX_ADDR expands `gaw`, without surrounding parentheses. An
 * argument containing an operator that binds less tightly than `->` or
 * `<<` (e.g. a conditional expression) would be mis-parsed. The only use
 * visible in this excerpt passes `domain->gaw`, which is safe.
 */
41 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
42 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
/* address window reserved from DMA in dmar_init_reserved_ranges() */
44 #define IOAPIC_RANGE_START (0xfee00000)
45 #define IOAPIC_RANGE_END (0xfeefffff)
46 #define IOVA_START_ADDR (0x1000)
48 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
/* upper bound used by IOMMU_WAIT_OP before it panics */
50 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */
/* largest address representable in `gaw` bits */
52 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
54 static void domain_remove_dev_info(struct dmar_domain *domain);
56 static int dmar_disabled;
57 static int __initdata dmar_map_gfx = 1;
58 static int dmar_forcedac;
60 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
61 static DEFINE_SPINLOCK(device_domain_lock);
62 static LIST_HEAD(device_domain_list);
/*
 * Parser for the "intel_iommu=" kernel command-line option (registered by
 * the __setup() line at the end of this block).
 *
 * Options are recognised by fixed-length prefix match (strncmp): "off",
 * "igfx_off" and "forcedac". A message is logged for each recognised
 * option and the cursor is then moved to the next ',' in the string,
 * presumably so that a comma-separated list can be processed.
 *
 * NOTE(review): the statements that update dmar_disabled, dmar_map_gfx and
 * dmar_forcedac, the enclosing loop and the return value are not visible
 * in this excerpt -- confirm against the full file.
 */
64 static int __init intel_iommu_setup(char *str)
69 if (!strncmp(str, "off", 3)) {
71 printk(KERN_INFO"Intel-IOMMU: disabled\n");
72 } else if (!strncmp(str, "igfx_off", 8)) {
75 "Intel-IOMMU: disable GFX device mapping\n");
76 } else if (!strncmp(str, "forcedac", 8)) {
78 "Intel-IOMMU: Forcing DAC for PCI devices\n");
/* skip to the next ',' (or to the terminating NUL if there is none) */
82 str += strcspn(str, ",");
88 __setup("intel_iommu=", intel_iommu_setup);
90 static struct kmem_cache *iommu_domain_cache;
91 static struct kmem_cache *iommu_devinfo_cache;
92 static struct kmem_cache *iommu_iova_cache;
/*
 * Allocate one object from @cachep with GFP_ATOMIC while PF_MEMALLOC is
 * set on the current task.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the function returns vaddr -- confirm.
 */
94 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
99 /* trying to avoid low memory issues */
/* remember whether PF_MEMALLOC was already set so it can be restored */
100 flags = current->flags & PF_MEMALLOC;
101 current->flags |= PF_MEMALLOC;
102 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
/* drop PF_MEMALLOC again unless it was already set on entry */
103 current->flags &= (~PF_MEMALLOC | flags);
/*
 * Allocate one zero-filled page (get_zeroed_page, GFP_ATOMIC) for use as a
 * root, context or page-table page, with PF_MEMALLOC set on the current
 * task during the allocation.
 *
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the function returns vaddr -- confirm.
 */
108 static inline void *alloc_pgtable_page(void)
113 /* trying to avoid low memory issues */
/* remember whether PF_MEMALLOC was already set so it can be restored */
114 flags = current->flags & PF_MEMALLOC;
115 current->flags |= PF_MEMALLOC;
116 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
/* drop PF_MEMALLOC again unless it was already set on entry */
117 current->flags &= (~PF_MEMALLOC | flags);
/* Release a page obtained from alloc_pgtable_page(). */
121 static inline void free_pgtable_page(void *vaddr)
123 free_page((unsigned long)vaddr);
/* Allocate one object from iommu_domain_cache (see iommu_kmem_cache_alloc). */
126 static inline void *alloc_domain_mem(void)
128 return iommu_kmem_cache_alloc(iommu_domain_cache);
/* Return an object to iommu_domain_cache. */
131 static inline void free_domain_mem(void *vaddr)
133 kmem_cache_free(iommu_domain_cache, vaddr);
/* Allocate one object from iommu_devinfo_cache (see iommu_kmem_cache_alloc). */
136 static inline void * alloc_devinfo_mem(void)
138 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
/* Return an object to iommu_devinfo_cache. */
141 static inline void free_devinfo_mem(void *vaddr)
143 kmem_cache_free(iommu_devinfo_cache, vaddr);
/*
 * Allocate one struct iova from iommu_iova_cache. Not static, so it is
 * visible to other translation units.
 */
146 struct iova *alloc_iova_mem(void)
148 return iommu_kmem_cache_alloc(iommu_iova_cache);
/* Return a struct iova to iommu_iova_cache; counterpart of alloc_iova_mem(). */
151 void free_iova_mem(struct iova *iova)
153 kmem_cache_free(iommu_iova_cache, iova);
/*
 * Flush @size bytes starting at @addr from the CPU cache, but only when
 * ecap_coherent() reports false for this IOMMU's extended capabilities.
 */
156 static inline void __iommu_flush_cache(
157 struct intel_iommu *iommu, void *addr, int size)
159 if (!ecap_coherent(iommu->ecap))
160 clflush_cache_range(addr, size);
163 /* Gets context entry for a given bus and devfn */
/*
 * Runs under iommu->lock. Looks up the root entry for @bus and the context
 * table it points to. A new context-table page can be allocated, flushed,
 * and installed in the root entry (address + present bit, root entry then
 * flushed). Returns a pointer to entry @devfn of the context table.
 *
 * NOTE(review): the conditions guarding the allocation and the early
 * unlock are not visible in this excerpt. Presumably the page is allocated
 * only when the root entry has no context table yet, and the early unlock
 * is the allocation-failure path -- confirm.
 */
164 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
167 struct root_entry *root;
168 struct context_entry *context;
169 unsigned long phy_addr;
172 spin_lock_irqsave(&iommu->lock, flags);
173 root = &iommu->root_entry[bus];
174 context = get_context_addr_from_root(root);
176 context = (struct context_entry *)alloc_pgtable_page();
178 spin_unlock_irqrestore(&iommu->lock, flags);
/* make the new table visible to the hardware before publishing it */
181 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
182 phy_addr = virt_to_phys((void *)context);
183 set_root_value(root, phy_addr);
184 set_root_present(root);
185 __iommu_flush_cache(iommu, root, sizeof(*root));
187 spin_unlock_irqrestore(&iommu->lock, flags);
188 return &context[devfn];
/*
 * Under iommu->lock, look up the context table for @bus and read the
 * present state of the entry for @devfn via context_present().
 *
 * NOTE(review): the handling of a missing context table and the return
 * statement are not visible in this excerpt; presumably `ret` is returned.
 */
191 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
193 struct root_entry *root;
194 struct context_entry *context;
198 spin_lock_irqsave(&iommu->lock, flags);
199 root = &iommu->root_entry[bus];
200 context = get_context_addr_from_root(root);
205 ret = context_present(context[devfn]);
207 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Under iommu->lock, clear the context entry for (@bus, @devfn) and flush
 * that entry from the CPU cache.
 *
 * NOTE(review): any guard for a missing context table and the size
 * argument of the flush call are not visible in this excerpt.
 */
211 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
213 struct root_entry *root;
214 struct context_entry *context;
217 spin_lock_irqsave(&iommu->lock, flags);
218 root = &iommu->root_entry[bus];
219 context = get_context_addr_from_root(root);
221 context_clear_entry(context[devfn]);
222 __iommu_flush_cache(iommu, &context[devfn], \
225 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Under iommu->lock, walk all ROOT_ENTRY_NR root entries, free the context
 * table page each one refers to, then free the root table itself and set
 * iommu->root_entry to NULL.
 *
 * NOTE(review): the body of the `!iommu->root_entry` check and any
 * NULL-check on `context` before freeing are not visible in this excerpt.
 */
228 static void free_context_table(struct intel_iommu *iommu)
230 struct root_entry *root;
233 struct context_entry *context;
235 spin_lock_irqsave(&iommu->lock, flags);
236 if (!iommu->root_entry) {
239 for (i = 0; i < ROOT_ENTRY_NR; i++) {
240 root = &iommu->root_entry[i];
241 context = get_context_addr_from_root(root);
243 free_pgtable_page(context);
245 free_pgtable_page(iommu->root_entry);
246 iommu->root_entry = NULL;
248 spin_unlock_irqrestore(&iommu->lock, flags);
251 /* page table handling */
252 #define LEVEL_STRIDE (9)
253 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
255 static inline int agaw_to_level(int agaw)
/* Address width in bits for an AGAW value: 30 + agaw * LEVEL_STRIDE. */
260 static inline int agaw_to_width(int agaw)
262 return 30 + agaw * LEVEL_STRIDE;
/*
 * Inverse of agaw_to_width(): (width - 30) / LEVEL_STRIDE, using integer
 * division, so widths that are not 30 + k * LEVEL_STRIDE round toward zero.
 */
266 static inline int width_to_agaw(int width)
268 return (width - 30) / LEVEL_STRIDE;
/*
 * Bit position at which the table index for @level starts within an
 * address: 12 for level 1, plus LEVEL_STRIDE for each further level.
 */
271 static inline unsigned int level_to_offset_bits(int level)
273 return (12 + (level - 1) * LEVEL_STRIDE);
/* Index into the @level table for @addr (LEVEL_STRIDE bits wide). */
276 static inline int address_level_offset(u64 addr, int level)
278 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
/* Mask that clears the address bits below the index field of @level. */
281 static inline u64 level_mask(int level)
283 return ((u64)-1 << level_to_offset_bits(level));
/* Number of bytes covered by one entry of a @level table. */
286 static inline u64 level_size(int level)
288 return ((u64)1 << level_to_offset_bits(level));
/* Round @addr up to the next multiple of level_size(@level). */
291 static inline u64 align_to_level(u64 addr, int level)
293 return ((addr + level_size(level) - 1) & level_mask(level));
/*
 * Walk @domain's page table for @addr under domain->mapping_lock, starting
 * at domain->pgd and the level derived from domain->agaw. @addr is first
 * truncated to the domain's address width. Where an intermediate entry is
 * not present, a new page-table page is allocated, flushed, linked in and
 * marked readable and writable.
 *
 * NOTE(review): the loop header, the termination test, the
 * allocation-failure check and the return statement are not visible in
 * this excerpt. Presumably the function returns the last-level pte, or
 * NULL after the early unlock when allocation fails -- confirm.
 */
296 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
298 int addr_width = agaw_to_width(domain->agaw);
299 struct dma_pte *parent, *pte = NULL;
300 int level = agaw_to_level(domain->agaw);
304 BUG_ON(!domain->pgd);
/* ignore address bits above the domain's address width */
306 addr &= (((u64)1) << addr_width) - 1;
307 parent = domain->pgd;
309 spin_lock_irqsave(&domain->mapping_lock, flags);
313 offset = address_level_offset(addr, level);
314 pte = &parent[offset];
318 if (!dma_pte_present(*pte)) {
319 tmp_page = alloc_pgtable_page();
322 spin_unlock_irqrestore(&domain->mapping_lock,
326 __iommu_flush_cache(domain->iommu, tmp_page,
328 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
330 * high level table always sets r/w, last level page
331 * table control read/write
333 dma_set_pte_readable(*pte);
334 dma_set_pte_writable(*pte);
335 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
/* descend into the next-level table */
337 parent = phys_to_virt(dma_pte_addr(*pte));
341 spin_unlock_irqrestore(&domain->mapping_lock, flags);
345 /* return the pte that maps addr at the requested level */
/*
 * Walks down from domain->pgd, indexing each table with the offset of
 * @addr for the current level (`total`), and follows present entries to
 * the next table. No lock is taken in the visible lines.
 *
 * NOTE(review): the remaining parameters (the loop uses `level`), the
 * decrement of `total`, the exit taken when a pte is not present and the
 * return statements are not visible in this excerpt.
 */
346 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
349 struct dma_pte *parent, *pte = NULL;
350 int total = agaw_to_level(domain->agaw);
353 parent = domain->pgd;
354 while (level <= total) {
355 offset = address_level_offset(addr, total);
356 pte = &parent[offset];
360 if (!dma_pte_present(*pte))
362 parent = phys_to_virt(dma_pte_addr(*pte));
368 /* clear the last-level page table entry that maps one page */
/*
 * Looks up the level-1 pte for @addr and flushes it from the CPU cache.
 *
 * NOTE(review): the NULL check on `pte` and the statement that actually
 * clears the entry are not visible in this excerpt.
 */
369 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
371 struct dma_pte *pte = NULL;
373 /* get last level pte */
374 pte = dma_addr_level_pte(domain, addr, 1);
378 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
382 /* clear last-level ptes; the caller must follow up with a TLB flush */
/*
 * @start and @end are truncated to the domain's address width; @start is
 * then rounded with PAGE_ALIGN_4K and every 4K page in [start, end) is
 * cleared with dma_pte_clear_one().
 *
 * NOTE(review): any adjustment of `end` to a page boundary is not visible
 * in this excerpt.
 */
383 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
385 int addr_width = agaw_to_width(domain->agaw);
387 start &= (((u64)1) << addr_width) - 1;
388 end &= (((u64)1) << addr_width) - 1;
389 /* in case it's partial page */
390 start = PAGE_ALIGN_4K(start);
393 /* we don't need lock here, nobody else touches the iova range */
394 while (start < end) {
395 dma_pte_clear_one(domain, start);
396 start += PAGE_SIZE_4K;
400 /* free page-table pages; last-level ptes should already be cleared */
/*
 * For each level up to the domain's top level, starts at @start rounded up
 * to that level's entry size and steps through the range, looking up the
 * pte at that level and freeing the page it points to. The test at 417
 * stops when the next whole entry would not fit below @end. When the range
 * covers the entire address space (start == 0 and end at or above the
 * maximum address), the top-level pgd page is freed as well.
 *
 * NOTE(review): the starting value of `level`, the inner loop header, the
 * clearing of the freed pte and the reset of domain->pgd are not visible
 * in this excerpt.
 */
401 static void dma_pte_free_pagetable(struct dmar_domain *domain,
404 int addr_width = agaw_to_width(domain->agaw);
406 int total = agaw_to_level(domain->agaw);
410 start &= (((u64)1) << addr_width) - 1;
411 end &= (((u64)1) << addr_width) - 1;
413 /* we don't need lock here, nobody else touches the iova range */
415 while (level <= total) {
416 tmp = align_to_level(start, level);
417 if (tmp >= end || (tmp + level_size(level) > end))
421 pte = dma_addr_level_pte(domain, tmp, level);
424 phys_to_virt(dma_pte_addr(*pte)));
426 __iommu_flush_cache(domain->iommu,
429 tmp += level_size(level);
434 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
435 free_pgtable_page(domain->pgd);
/*
 * Allocate a zeroed page for the root table, flush it from the CPU cache
 * and publish it in iommu->root_entry under iommu->lock.
 *
 * NOTE(review): the allocation-failure check and the return values are not
 * visible in this excerpt.
 */
441 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
443 struct root_entry *root;
446 root = (struct root_entry *)alloc_pgtable_page();
450 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
452 spin_lock_irqsave(&iommu->lock, flags);
453 iommu->root_entry = root;
454 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Read register `offset` of `iommu` with accessor `op` into `sts`, with a
 * timeout: if more than DMAR_OPERATION_TIMEOUT jiffies have elapsed since
 * the macro started, the kernel panics.
 *
 * NOTE(review): the loop construct and the test of `cond` are not visible
 * in this excerpt; presumably the read is repeated until `cond` is true.
 * `iommu` and `offset` are expanded without parentheses, so callers should
 * pass simple expressions.
 */
459 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
461 unsigned long start_time = jiffies;\
463 sts = op (iommu->reg + offset);\
466 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
467 panic("DMAR hardware is malfunctioning\n");\
/*
 * Program the physical address of iommu->root_entry into DMAR_RTADDR_REG,
 * issue the DMA_GCMD_SRTP command and wait until DMA_GSTS_RTPS is set in
 * the status register. All done under iommu->register_lock.
 */
472 static void iommu_set_root_entry(struct intel_iommu *iommu)
478 addr = iommu->root_entry;
480 spin_lock_irqsave(&iommu->register_lock, flag);
481 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
/* one-shot command bit: OR-ed into the written value, not stored in iommu->gcmd */
483 cmd = iommu->gcmd | DMA_GCMD_SRTP;
484 writel(cmd, iommu->reg + DMAR_GCMD_REG);
486 /* Make sure the hardware has completed it */
487 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
488 readl, (sts & DMA_GSTS_RTPS), sts);
490 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Issue a write-buffer flush (DMA_GCMD_WBF) and wait until DMA_GSTS_WBFS
 * is clear in the status register, under iommu->register_lock.
 *
 * NOTE(review): the statement executed when cap_rwbf() is false is not
 * visible in this excerpt; presumably the function returns early.
 */
493 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
498 if (!cap_rwbf(iommu->cap))
500 val = iommu->gcmd | DMA_GCMD_WBF;
502 spin_lock_irqsave(&iommu->register_lock, flag);
503 writel(val, iommu->reg + DMAR_GCMD_REG);
505 /* Make sure the hardware has completed it */
/* `val` is reused here to receive the status register contents */
506 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
507 readl, (!(val & DMA_GSTS_WBFS)), val);
509 spin_unlock_irqrestore(&iommu->register_lock, flag);
512 /* the return value determines whether the caller needs a write buffer flush */
/*
 * Build a context-cache invalidation command for @type (global, domain- or
 * device-selective), write it to DMAR_CCMD_REG under iommu->register_lock
 * and wait until DMA_CCMD_ICC is clear in that register.
 *
 * NOTE(review): the switch header, the default case, the adjustment of
 * `did` for non-present flushes, the setting of the ICC bit and the return
 * statements are not visible in this excerpt.
 */
513 static int __iommu_flush_context(struct intel_iommu *iommu,
514 u16 did, u16 source_id, u8 function_mask, u64 type,
515 int non_present_entry_flush)
521 * In the non-present entry flush case, if hardware doesn't cache
522 * non-present entry we do nothing and if hardware cache non-present
523 * entry, we flush entries of domain 0 (the domain id is used to cache
524 * any non-present entries)
526 if (non_present_entry_flush) {
527 if (!cap_caching_mode(iommu->cap))
534 case DMA_CCMD_GLOBAL_INVL:
535 val = DMA_CCMD_GLOBAL_INVL;
537 case DMA_CCMD_DOMAIN_INVL:
538 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
540 case DMA_CCMD_DEVICE_INVL:
541 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
542 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
549 spin_lock_irqsave(&iommu->register_lock, flag);
550 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
552 /* Make sure the hardware has completed it */
553 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
554 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
556 spin_unlock_irqrestore(&iommu->register_lock, flag);
558 /* flushing a context entry will implicitly flush the write buffer */
/* Global context-cache invalidation; did, source_id and function_mask are passed as 0. */
562 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
563 int non_present_entry_flush)
565 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
566 non_present_entry_flush);
/* Domain-selective context-cache invalidation for domain id @did. */
569 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
570 int non_present_entry_flush)
572 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
573 non_present_entry_flush);
/*
 * Device-selective context-cache invalidation for @source_id (with
 * @function_mask) in domain @did.
 */
576 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
577 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
579 return __iommu_flush_context(iommu, did, source_id, function_mask,
580 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
583 /* the return value determines whether the caller needs a write buffer flush */
/*
 * Build an IOTLB invalidation command for @type (global, domain-selective
 * or page-selective), add the read/write drain bits the hardware
 * advertises, write the address register and the command register (at
 * ecap_iotlb_offset and +8) under iommu->register_lock, and wait until
 * DMA_TLB_IVT is clear. Afterwards the granularity reported by the
 * hardware is checked: 0 is logged as a failed flush, and a mismatch with
 * the request is logged at debug level.
 *
 * NOTE(review): the switch header, the default case, the adjustment of
 * `did` for non-present flushes, the condition guarding the IVA write and
 * the return statements are not visible in this excerpt.
 */
584 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
585 u64 addr, unsigned int size_order, u64 type,
586 int non_present_entry_flush)
588 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
589 u64 val = 0, val_iva = 0;
593 * In the non-present entry flush case, if hardware doesn't cache
594 * non-present entry we do nothing and if hardware cache non-present
595 * entry, we flush entries of domain 0 (the domain id is used to cache
596 * any non-present entries)
598 if (non_present_entry_flush) {
599 if (!cap_caching_mode(iommu->cap))
606 case DMA_TLB_GLOBAL_FLUSH:
607 /* a global flush does not need IVA_REG to be set */
608 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
610 case DMA_TLB_DSI_FLUSH:
611 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
613 case DMA_TLB_PSI_FLUSH:
614 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
615 /* Note: always flush non-leaf currently */
/* the low bits of the page-aligned address carry the size order */
616 val_iva = size_order | addr;
621 /* Note: set drain read/write */
624 * This is probably to be super secure.. Looks like we can
625 * ignore it without any impact.
627 if (cap_read_drain(iommu->cap))
628 val |= DMA_TLB_READ_DRAIN;
630 if (cap_write_drain(iommu->cap))
631 val |= DMA_TLB_WRITE_DRAIN;
633 spin_lock_irqsave(&iommu->register_lock, flag);
634 /* Note: Only uses first TLB reg currently */
636 dmar_writeq(iommu->reg + tlb_offset, val_iva);
637 dmar_writeq(iommu->reg + tlb_offset + 8, val);
639 /* Make sure the hardware has completed it */
640 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
641 dmar_readq, (!(val & DMA_TLB_IVT)), val);
643 spin_unlock_irqrestore(&iommu->register_lock, flag);
645 /* check IOTLB invalidation granularity */
646 if (DMA_TLB_IAIG(val) == 0)
647 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
648 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
649 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
650 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
651 /* flushing the IOTLB will implicitly flush the write buffer */
/* Global IOTLB invalidation; did, addr and size_order are passed as 0. */
655 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
656 int non_present_entry_flush)
658 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
659 non_present_entry_flush);
/* Domain-selective IOTLB invalidation for domain id @did. */
662 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
663 int non_present_entry_flush)
665 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
666 non_present_entry_flush);
/*
 * Page-selective IOTLB invalidation of @pages 4K pages starting at @addr
 * in domain @did. @addr must be 4K aligned (BUG otherwise).
 *
 * Falls back to a domain-selective flush when the hardware lacks
 * page-selective invalidation, or when the rounded-up size order exceeds
 * cap_max_amask_val().
 */
669 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
670 u64 addr, unsigned int pages, int non_present_entry_flush)
674 BUG_ON(addr & (~PAGE_MASK_4K));
677 /* Fallback to domain selective flush if no PSI support */
678 if (!cap_pgsel_inv(iommu->cap))
679 return iommu_flush_iotlb_dsi(iommu, did,
680 non_present_entry_flush);
683 * PSI requires page size to be 2 ^ x, and the base address is naturally
684 * aligned to the size
/* mask = log2 of the page count rounded up to a power of two */
686 mask = ilog2(__roundup_pow_of_two(pages));
687 /* Fallback to domain selective flush if size is too big */
688 if (mask > cap_max_amask_val(iommu->cap))
689 return iommu_flush_iotlb_dsi(iommu, did,
690 non_present_entry_flush);
692 return __iommu_flush_iotlb(iommu, did, addr, mask,
693 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
/*
 * Clear DMA_PMEN_EPM in DMAR_PMEN_REG and wait until the DMA_PMEN_PRS
 * status bit reads back as clear, under iommu->register_lock.
 */
696 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
701 spin_lock_irqsave(&iommu->register_lock, flags);
702 pmen = readl(iommu->reg + DMAR_PMEN_REG);
703 pmen &= ~DMA_PMEN_EPM;
704 writel(pmen, iommu->reg + DMAR_PMEN_REG);
706 /* wait for the protected region status bit to clear */
707 IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
708 readl, !(pmen & DMA_PMEN_PRS), pmen);
710 spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Set DMA_GCMD_TE in the global command register, wait until DMA_GSTS_TES
 * is reported in the status register, then record the bit in iommu->gcmd.
 * All done under iommu->register_lock.
 *
 * NOTE(review): the return statement is not visible in this excerpt.
 */
713 static int iommu_enable_translation(struct intel_iommu *iommu)
718 spin_lock_irqsave(&iommu->register_lock, flags);
719 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
721 /* Make sure the hardware has completed it */
722 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
723 readl, (sts & DMA_GSTS_TES), sts);
/* the cached command value is updated only after the hardware acknowledged */
725 iommu->gcmd |= DMA_GCMD_TE;
726 spin_unlock_irqrestore(&iommu->register_lock, flags);
/*
 * Clear DMA_GCMD_TE in iommu->gcmd, write the result to the global command
 * register and wait until DMA_GSTS_TES is clear in the status register.
 * All done under iommu->register_lock.
 *
 * NOTE(review): the return statement is not visible in this excerpt.
 */
730 static int iommu_disable_translation(struct intel_iommu *iommu)
735 spin_lock_irqsave(&iommu->register_lock, flag);
736 iommu->gcmd &= ~DMA_GCMD_TE;
737 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
739 /* Make sure the hardware has completed it */
740 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
741 readl, (!(sts & DMA_GSTS_TES)), sts);
743 spin_unlock_irqrestore(&iommu->register_lock, flag);
747 /* iommu interrupt handling. Most of it is MSI-like. */
/*
 * Human-readable descriptions, indexed by the fault reason code passed to
 * dmar_get_fault_reason(). MAX_FAULT_REASON_IDX is the index of the last
 * array element.
 *
 * NOTE(review): the listing skips lines at the start and end of the
 * initializer, so the first and last entries may not be shown here --
 * check the full file before relying on the index-to-string mapping.
 */
749 static const char *fault_reason_strings[] =
752 "Present bit in root entry is clear",
753 "Present bit in context entry is clear",
754 "Invalid context entry",
755 "Access beyond MGAW",
756 "PTE Write access is not set",
757 "PTE Read access is not set",
758 "Next page table ptr is invalid",
759 "Root table address invalid",
760 "Context table ptr is invalid",
761 "non-zero reserved fields in RTP",
762 "non-zero reserved fields in CTP",
763 "non-zero reserved fields in PTE",
765 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
/*
 * Map a hardware fault reason code to its description from
 * fault_reason_strings[]. Codes above MAX_FAULT_REASON_IDX take a separate
 * path so the array is never indexed out of bounds.
 *
 * NOTE(review): the value returned for out-of-range codes is not visible
 * in this excerpt.
 */
767 const char *dmar_get_fault_reason(u8 fault_reason)
769 if (fault_reason > MAX_FAULT_REASON_IDX)
772 return fault_reason_strings[fault_reason];
/*
 * Unmask the fault-event interrupt of the IOMMU attached to @irq (via
 * get_irq_data) by writing 0 to DMAR_FECTL_REG, under register_lock.
 */
775 void dmar_msi_unmask(unsigned int irq)
777 struct intel_iommu *iommu = get_irq_data(irq);
781 spin_lock_irqsave(&iommu->register_lock, flag);
782 writel(0, iommu->reg + DMAR_FECTL_REG);
783 /* Read the register back to force the posted write to complete */
784 readl(iommu->reg + DMAR_FECTL_REG);
785 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Mask the fault-event interrupt of the IOMMU attached to @irq by writing
 * DMA_FECTL_IM to DMAR_FECTL_REG, under register_lock.
 */
788 void dmar_msi_mask(unsigned int irq)
791 struct intel_iommu *iommu = get_irq_data(irq);
794 spin_lock_irqsave(&iommu->register_lock, flag);
795 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
796 /* Read the register back to force the posted write to complete */
797 readl(iommu->reg + DMAR_FECTL_REG);
798 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Program the fault-event MSI message (data, low and high address) of the
 * IOMMU attached to @irq from @msg, under register_lock.
 */
801 void dmar_msi_write(int irq, struct msi_msg *msg)
803 struct intel_iommu *iommu = get_irq_data(irq);
806 spin_lock_irqsave(&iommu->register_lock, flag);
807 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
808 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
809 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
810 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Read the fault-event MSI message (data, low and high address) of the
 * IOMMU attached to @irq into @msg, under register_lock.
 */
813 void dmar_msi_read(int irq, struct msi_msg *msg)
815 struct intel_iommu *iommu = get_irq_data(irq);
818 spin_lock_irqsave(&iommu->register_lock, flag);
819 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
820 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
821 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
822 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Log a single DMA fault: the request type, the requesting device decoded
 * from @source_id (bus in the high byte, devfn in the low byte), the
 * faulting address, and the reason code with its description.
 *
 * NOTE(review): the printk call line, part of the format string and the
 * return statement are not visible in this excerpt.
 */
825 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
826 u8 fault_reason, u16 source_id, u64 addr)
830 reason = dmar_get_fault_reason(fault_reason);
833 "DMAR:[%s] Request device [%02x:%02x.%d] "
835 "DMAR:[fault reason %02d] %s\n",
836 (type ? "DMA Read" : "DMA Write"),
837 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
838 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
/* size in bytes of one primary fault recording register */
842 #define PRIMARY_FAULT_REG_LEN (16)
/*
 * Fault interrupt handler. Reads the fault status register. If a primary
 * fault is pending it processes fault records starting at the index given
 * by the status register: decode reason, type, source id and address,
 * clear the record's F bit, and report the fault through
 * iommu_page_fault_do_one() with register_lock dropped. Finally it clears
 * a primary fault overflow indication, if one is set.
 *
 * NOTE(review): the loop construct, the jump taken when no primary fault
 * is pending, the increment/reset of fault_index and the return value are
 * not visible in this excerpt.
 */
843 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
845 struct intel_iommu *iommu = dev_id;
846 int reg, fault_index;
850 spin_lock_irqsave(&iommu->register_lock, flag);
851 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
853 /* TBD: ignore advanced fault log currently */
854 if (!(fault_status & DMA_FSTS_PPF))
857 fault_index = dma_fsts_fault_record_index(fault_status);
858 reg = cap_fault_reg_offset(iommu->cap);
866 /* highest 32 bits */
867 data = readl(iommu->reg + reg +
868 fault_index * PRIMARY_FAULT_REG_LEN + 12);
/* F bit clear: no (further) fault recorded in this register */
869 if (!(data & DMA_FRCD_F))
872 fault_reason = dma_frcd_fault_reason(data);
873 type = dma_frcd_type(data);
875 data = readl(iommu->reg + reg +
876 fault_index * PRIMARY_FAULT_REG_LEN + 8);
877 source_id = dma_frcd_source_id(data);
879 guest_addr = dmar_readq(iommu->reg + reg +
880 fault_index * PRIMARY_FAULT_REG_LEN);
881 guest_addr = dma_frcd_page_addr(guest_addr);
882 /* clear the fault */
883 writel(DMA_FRCD_F, iommu->reg + reg +
884 fault_index * PRIMARY_FAULT_REG_LEN + 12);
/* drop the lock while logging; it is re-taken below */
886 spin_unlock_irqrestore(&iommu->register_lock, flag);
888 iommu_page_fault_do_one(iommu, type, fault_reason,
889 source_id, guest_addr);
/*
 * NOTE(review): fault_index is a zero-based multiplier for the record
 * offset. If cap_num_fault_regs() returns the number of records, index ==
 * count is already one past the last record, so `>=` looks intended here
 * -- confirm against the definition of cap_num_fault_regs().
 */
892 if (fault_index > cap_num_fault_regs(iommu->cap))
894 spin_lock_irqsave(&iommu->register_lock, flag);
897 /* clear primary fault overflow */
898 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
899 if (fault_status & DMA_FSTS_PFO)
900 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
902 spin_unlock_irqrestore(&iommu->register_lock, flag);
/*
 * Set up the fault interrupt for @iommu: attach the iommu as irq data, let
 * the architecture code configure the MSI (undoing the irq data on the
 * visible error path), run the fault handler once by hand to clear any
 * fault already recorded, then install iommu_page_fault() as the handler.
 *
 * NOTE(review): the irq allocation, the error checks around each step and
 * the return statements are not visible in this excerpt.
 */
906 int dmar_set_interrupt(struct intel_iommu *iommu)
912 printk(KERN_ERR "IOMMU: no free vectors\n");
916 set_irq_data(irq, iommu);
919 ret = arch_setup_dmar_msi(irq);
921 set_irq_data(irq, NULL);
927 /* Make sure the fault registers are cleared before the handler is installed */
928 iommu_page_fault(irq, iommu);
930 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
932 printk(KERN_ERR "IOMMU: can't request irq\n");
/*
 * Allocate the per-IOMMU domain bookkeeping: a zeroed bitmap of domain ids
 * (one bit per domain reported by cap_ndoms) and a zeroed array of
 * dmar_domain pointers. If the domain array cannot be allocated the bitmap
 * is freed again. When the hardware is in caching mode, domain id 0 is
 * marked as used up front.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
936 static int iommu_init_domains(struct intel_iommu *iommu)
938 unsigned long ndomains;
939 unsigned long nlongs;
941 ndomains = cap_ndoms(iommu->cap);
942 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
943 nlongs = BITS_TO_LONGS(ndomains);
945 /* TBD: there might be 64K domains,
946 * consider other allocation for future chip
948 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
949 if (!iommu->domain_ids) {
950 printk(KERN_ERR "Allocating domain id array failed\n");
953 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
955 if (!iommu->domains) {
956 printk(KERN_ERR "Allocating domain array failed\n");
957 kfree(iommu->domain_ids);
962 * if Caching mode is set, then invalid translations are tagged
963 * with domainid 0. Hence we need to pre-allocate it.
965 if (cap_caching_mode(iommu->cap))
966 set_bit(0, iommu->domain_ids);
/*
 * Create the intel_iommu object for one DRHD unit. Allocates a zeroed
 * structure, maps the first 4K of the register window and reads the
 * capability registers. If the IOTLB or fault registers lie beyond the
 * first page, the window is mapped again with the larger page-aligned
 * size. Then initialises the domain bookkeeping and both spinlocks and
 * stores the result in drhd->iommu.
 *
 * NOTE(review): the failure checks, the unmap before the second ioremap,
 * the error-path labels and the return statements are not visible in this
 * excerpt.
 */
970 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
972 struct intel_iommu *iommu;
977 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
980 iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
982 printk(KERN_ERR "IOMMU: can't map the region\n");
985 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
986 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
988 /* the registers might be more than one page */
989 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
990 cap_max_fault_reg_offset(iommu->cap));
991 map_size = PAGE_ALIGN_4K(map_size);
992 if (map_size > PAGE_SIZE_4K) {
994 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
996 printk(KERN_ERR "IOMMU: can't map the region\n");
1001 ver = readl(iommu->reg + DMAR_VER_REG);
1002 pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1003 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1004 iommu->cap, iommu->ecap);
1005 ret = iommu_init_domains(iommu);
1008 spin_lock_init(&iommu->lock);
1009 spin_lock_init(&iommu->register_lock);
1011 drhd->iommu = iommu;
1014 iounmap(iommu->reg);
1020 static void domain_exit(struct dmar_domain *domain);
/*
 * Tear down an intel_iommu: destroy every domain whose id bit is set,
 * disable translation if it is enabled, release the fault interrupt, free
 * the domain bookkeeping arrays and the root/context tables, and unmap the
 * register window.
 *
 * NOTE(review): the guards around the interrupt teardown and the register
 * unmap, and the final free of the iommu structure, are not visible in
 * this excerpt.
 */
1021 static void free_iommu(struct intel_iommu *iommu)
1023 struct dmar_domain *domain;
1029 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1030 for (; i < cap_ndoms(iommu->cap); ) {
1031 domain = iommu->domains[i];
/* clear the id bit first, then destroy the domain */
1032 clear_bit(i, iommu->domain_ids);
1033 domain_exit(domain);
1034 i = find_next_bit(iommu->domain_ids,
1035 cap_ndoms(iommu->cap), i+1);
1038 if (iommu->gcmd & DMA_GCMD_TE)
1039 iommu_disable_translation(iommu);
1042 set_irq_data(iommu->irq, NULL);
1043 /* This will mask the irq */
1044 free_irq(iommu->irq, iommu);
1045 destroy_irq(iommu->irq);
1048 kfree(iommu->domains);
1049 kfree(iommu->domain_ids);
1051 /* free context mapping */
1052 free_context_table(iommu);
1055 iounmap(iommu->reg);
/*
 * Allocate a dmar_domain and give it the lowest free domain id on @iommu.
 * The id search, the bitmap update and the registration in
 * iommu->domains[] happen under iommu->lock. If no id is free the domain
 * memory is released and an error is logged.
 *
 * NOTE(review): the allocation-failure check, the assignment of
 * domain->id and the return statements are not visible in this excerpt.
 */
1059 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1062 unsigned long ndomains;
1063 struct dmar_domain *domain;
1064 unsigned long flags;
1066 domain = alloc_domain_mem();
1070 ndomains = cap_ndoms(iommu->cap);
1072 spin_lock_irqsave(&iommu->lock, flags);
1073 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1074 if (num >= ndomains) {
1075 spin_unlock_irqrestore(&iommu->lock, flags);
1076 free_domain_mem(domain);
1077 printk(KERN_ERR "IOMMU: no free domain ids\n");
1081 set_bit(num, iommu->domain_ids);
1083 domain->iommu = iommu;
1084 iommu->domains[num] = domain;
1085 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Release @domain's id in its IOMMU's domain-id bitmap, under that IOMMU's
 * lock. The domain structure itself is not freed here.
 */
1090 static void iommu_free_domain(struct dmar_domain *domain)
1092 unsigned long flags;
1094 spin_lock_irqsave(&domain->iommu->lock, flags);
1095 clear_bit(domain->id, domain->iommu->domain_ids);
1096 spin_unlock_irqrestore(&domain->iommu->lock, flags);
/* IOVA ranges that must never be handed out; copied into every domain */
1099 static struct iova_domain reserved_iova_list;
/*
 * Fill reserved_iova_list with the IOAPIC address window and with every
 * memory resource of every PCI device, so those addresses are never used
 * as DMA addresses. Failures are only logged.
 *
 * NOTE(review): the assignment of `addr` from the resource and the
 * `continue` after the flags test are not visible in this excerpt.
 */
1101 static void dmar_init_reserved_ranges(void)
1103 struct pci_dev *pdev = NULL;
1108 init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1110 /* IOAPIC ranges shouldn't be accessed by DMA */
1111 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1112 IOVA_PFN(IOAPIC_RANGE_END));
1114 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1116 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1117 for_each_pci_dev(pdev) {
1120 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1121 r = &pdev->resource[i];
1122 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1125 addr &= PAGE_MASK_4K;
/*
 * NOTE(review): if r->end is the last byte of the resource (inclusive),
 * this is one less than the length measured from the aligned start. The
 * rounding below usually hides that, but a resource ending exactly on the
 * first byte of a page would be reserved one page short -- confirm.
 */
1126 size = r->end - addr;
1127 size = PAGE_ALIGN_4K(size);
1128 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1129 IOVA_PFN(size + addr) - 1);
1131 printk(KERN_ERR "Reserve iova failed\n");
/* Copy the global reserved IOVA ranges into @domain's IOVA allocator. */
1137 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1139 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1142 static inline int guestwidth_to_adjustwidth(int gaw)
1145 int r = (gaw - 12) % 9;
/*
 * Initialise a freshly allocated domain: set up its IOVA allocator and
 * mapping lock, copy in the reserved ranges, choose the address width, and
 * allocate and flush the top-level page directory.
 *
 * @guest_width is clamped to the hardware maximum (cap_mgaw). The matching
 * AGAW is derived from it; if the hardware does not advertise that AGAW in
 * cap_sagaw, the next supported larger value (searching bits below 5) is
 * chosen.
 *
 * NOTE(review): the failure handling after find_next_bit and after the pgd
 * allocation, and the return statements, are not visible in this excerpt.
 */
1156 static int domain_init(struct dmar_domain *domain, int guest_width)
1158 struct intel_iommu *iommu;
1159 int adjust_width, agaw;
1160 unsigned long sagaw;
1162 init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1163 spin_lock_init(&domain->mapping_lock);
1165 domain_reserve_special_ranges(domain);
1167 /* calculate AGAW */
1168 iommu = domain->iommu;
1169 if (guest_width > cap_mgaw(iommu->cap))
1170 guest_width = cap_mgaw(iommu->cap);
1171 domain->gaw = guest_width;
1172 adjust_width = guestwidth_to_adjustwidth(guest_width);
1173 agaw = width_to_agaw(adjust_width);
1174 sagaw = cap_sagaw(iommu->cap);
1175 if (!test_bit(agaw, &sagaw)) {
1176 /* hardware doesn't support it, choose a bigger one */
1177 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1178 agaw = find_next_bit(&sagaw, 5, agaw);
1182 domain->agaw = agaw;
1183 INIT_LIST_HEAD(&domain->devices);
1185 /* always allocate the top pgd */
1186 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1189 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
/*
 * Destroy a domain: detach all of its devices, release its IOVA allocator,
 * clear and free its page tables, give back its domain id and free the
 * structure.
 *
 * NOTE(review): the early-return test described by the "Domain 0" comment
 * is not visible in this excerpt.
 */
1193 static void domain_exit(struct dmar_domain *domain)
1197 /* Domain 0 is reserved, so dont process it */
1201 domain_remove_dev_info(domain);
1203 put_iova_domain(&domain->iovad);
1204 end = DOMAIN_MAX_ADDR(domain->gaw);
/*
 * NOTE(review): elsewhere in this file PAGE_MASK_4K selects the page-frame
 * bits (`addr &= PAGE_MASK_4K`, `BUG_ON(addr & (~PAGE_MASK_4K))`). Masking
 * with its complement therefore keeps only the in-page offset of the
 * maximum address, so the range passed to the two calls below is smaller
 * than one page and the domain's page tables would not be cleared or
 * freed. `end & PAGE_MASK_4K` was probably intended -- confirm.
 */
1205 end = end & (~PAGE_MASK_4K);
1208 dma_pte_clear_range(domain, 0, end);
1210 /* free page tables */
1211 dma_pte_free_pagetable(domain, 0, end);
1213 iommu_free_domain(domain);
1214 free_domain_mem(domain);
/*
 * Point the context entry for one (bus, devfn) at @domain's page table.
 *
 * Under iommu->lock: if the entry is already present nothing is changed.
 * Otherwise the entry is filled in (domain id, address width, page-table
 * root, multi-level translation, fault enable, present) and flushed from
 * the CPU cache. A device-selective context flush is then issued as a
 * non-present-to-present change, followed by a write-buffer flush if that
 * call asks for one, and a domain-selective IOTLB flush for domain id 0.
 *
 * NOTE(review): the remaining parameters, the NULL check on `context`, the
 * placement of the IOTLB flush relative to the preceding `if`, and the
 * return values are not visible in this excerpt.
 */
1217 static int domain_context_mapping_one(struct dmar_domain *domain,
1220 struct context_entry *context;
1221 struct intel_iommu *iommu = domain->iommu;
1222 unsigned long flags;
1224 pr_debug("Set context mapping for %02x:%02x.%d\n",
1225 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1226 BUG_ON(!domain->pgd);
1227 context = device_to_context_entry(iommu, bus, devfn);
1230 spin_lock_irqsave(&iommu->lock, flags);
1231 if (context_present(*context)) {
1232 spin_unlock_irqrestore(&iommu->lock, flags);
1236 context_set_domain_id(*context, domain->id);
1237 context_set_address_width(*context, domain->agaw);
1238 context_set_address_root(*context, virt_to_phys(domain->pgd));
1239 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1240 context_set_fault_enable(*context);
1241 context_set_present(*context);
1242 __iommu_flush_cache(iommu, context, sizeof(*context));
1244 /* it's a non-present to present mapping */
/* source id = bus number in the high byte, devfn in the low byte */
1245 if (iommu_flush_context_device(iommu, domain->id,
1246 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1247 iommu_flush_write_buffer(iommu);
1249 iommu_flush_iotlb_dsi(iommu, 0, 0);
1250 spin_unlock_irqrestore(&iommu->lock, flags);
/*
 * Set up context mappings for @pdev and for the bridges its DMA requests
 * may appear to come from.
 *
 * Maps the device itself, then looks up the upstream PCIe bridge. Every
 * bridge between the device and that upstream bridge is mapped too.
 * Finally the upstream bridge is mapped, either as (secondary bus, devfn
 * 0) for a PCIe-to-PCI bridge or under its own bus/devfn for a legacy PCI
 * bridge.
 *
 * NOTE(review): the return type line, the error checks after each call and
 * the early return when no upstream bridge exists are not visible in this
 * excerpt.
 */
1255 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1258 struct pci_dev *tmp, *parent;
1260 ret = domain_context_mapping_one(domain, pdev->bus->number,
1265 /* dependent device mapping */
1266 tmp = pci_find_upstream_pcie_bridge(pdev);
1269 /* Secondary interface's bus number and devfn 0 */
1270 parent = pdev->bus->self;
1271 while (parent != tmp) {
1272 ret = domain_context_mapping_one(domain, parent->bus->number,
1276 parent = parent->bus->self;
1278 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1279 return domain_context_mapping_one(domain,
1280 tmp->subordinate->number, 0);
1281 else /* this is a legacy PCI bridge */
1282 return domain_context_mapping_one(domain,
1283 tmp->bus->number, tmp->devfn);
/*
 * Query counterpart of domain_context_mapping(): checks the context entry
 * of @pdev, of each bridge between it and its upstream PCIe bridge, and of
 * the upstream bridge itself (secondary bus / devfn 0, or the bridge's own
 * bus/devfn).
 *
 * NOTE(review): the tests on `ret`, the early return when no upstream
 * bridge exists and the is_pcie condition selecting between the last two
 * returns are not visible in this excerpt.
 */
1286 static int domain_context_mapped(struct dmar_domain *domain,
1287 struct pci_dev *pdev)
1290 struct pci_dev *tmp, *parent;
1292 ret = device_context_mapped(domain->iommu,
1293 pdev->bus->number, pdev->devfn);
1296 /* dependent device mapping */
1297 tmp = pci_find_upstream_pcie_bridge(pdev);
1300 /* Secondary interface's bus number and devfn 0 */
1301 parent = pdev->bus->self;
1302 while (parent != tmp) {
1303 ret = device_context_mapped(domain->iommu, parent->bus->number,
1307 parent = parent->bus->self;
1310 return device_context_mapped(domain->iommu,
1311 tmp->subordinate->number, 0);
1313 return device_context_mapped(domain->iommu,
1314 tmp->bus->number, tmp->devfn);
/*
 * Map the host physical range [@hpa, @hpa + @size) at @iova in @domain
 * with protection @prot, one 4K page at a time. @iova is rounded down to a
 * page boundary and the end of the host range is rounded up. For each page
 * the last-level pte is obtained (allocating intermediate tables as
 * needed), must not already hold an address (BUG otherwise), and is then
 * filled in and flushed from the CPU cache.
 *
 * @prot must contain DMA_PTE_READ and/or DMA_PTE_WRITE.
 *
 * NOTE(review): the return type line, the value returned for an invalid
 * @prot or a failed pte lookup, and the increments of start_pfn/index are
 * not visible in this excerpt.
 */
1318 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1319 u64 hpa, size_t size, int prot)
1321 u64 start_pfn, end_pfn;
1322 struct dma_pte *pte;
1325 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1327 iova &= PAGE_MASK_4K;
1328 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1329 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1331 while (start_pfn < end_pfn) {
1332 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1335 /* We don't need lock here, nobody else
1336 * touches the iova range
1338 BUG_ON(dma_pte_addr(*pte));
1339 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1340 dma_set_pte_prot(*pte, prot);
1341 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
/*
 * Remove the context entry for (@bus, @devfn) on @domain's IOMMU, then
 * issue a global context-cache flush and a global IOTLB flush.
 */
1348 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1350 clear_context_table(domain->iommu, bus, devfn);
1351 iommu_flush_context_global(domain->iommu, 0);
1352 iommu_flush_iotlb_global(domain->iommu, 0);
/*
 * Drain @domain's device list. Each device_domain_info is unlinked from
 * both the per-domain list and the global list while device_domain_lock is
 * held. The lock is released around detach_domain_for_dev() and
 * free_devinfo_mem(), then taken again before the list is re-examined.
 */
1355 static void domain_remove_dev_info(struct dmar_domain *domain)
1357 struct device_domain_info *info;
1358 unsigned long flags;
1360 spin_lock_irqsave(&device_domain_lock, flags);
1361 while (!list_empty(&domain->devices)) {
/* always take the current head of the list */
1362 info = list_entry(domain->devices.next,
1363 struct device_domain_info, link);
1364 list_del(&info->link);
1365 list_del(&info->global);
/* the device no longer points at this info */
1367 info->dev->dev.archdata.iommu = NULL;
1368 spin_unlock_irqrestore(&device_domain_lock, flags);
/* hardware teardown and free happen outside the lock */
1370 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1371 free_devinfo_mem(info);
1373 spin_lock_irqsave(&device_domain_lock, flags);
1375 spin_unlock_irqrestore(&device_domain_lock, flags);
1380 * Note: struct pci_dev->dev.archdata.iommu stores the info
/*
 * Look up the dmar_domain for @pdev through the device_domain_info kept in
 * pdev->dev.archdata.iommu. No lock is taken.
 */
1382 struct dmar_domain *
1383 find_domain(struct pci_dev *pdev)
1385 struct device_domain_info *info;
1387 /* No lock here, assumes no domain exit in normal case */
1388 info = pdev->dev.archdata.iommu;
1390 return info->domain;
/*
 * Check whether @dev appears among the first @cnt entries of @devices.
 * After scanning the array the code moves on to dev->bus->self (the bridge
 * above @dev), so a listed ancestor presumably counts as a match as well.
 */
1394 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1395 struct pci_dev *dev)
/* pointer comparison against every listed device */
1400 for (index = 0; index < cnt; index ++)
1401 if (dev == devices[index])
1404 /* Check our parent */
1405 dev = dev->bus->self;
/*
 * Walk dmar_drhd_units looking for the hardware unit responsible for @dev.
 * A unit qualifies when it is flagged include_all or when
 * dmar_pci_device_match() finds @dev in the unit's device list.
 */
1411 static struct dmar_drhd_unit *
1412 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1414 struct dmar_drhd_unit *drhd = NULL;
1416 list_for_each_entry(drhd, &dmar_drhd_units, list) {
1417 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1418 drhd->devices_cnt, dev))
/*
 * Find the domain for @pdev, creating and initialising one with address
 * width @gaw when none exists yet. When pci_find_upstream_pcie_bridge()
 * reports a bridge, that bridge's bus/devfn is the key used in
 * device_domain_list, and the domain is flagged as shared by multiple
 * devices. The final answer is re-read with find_domain() because another
 * caller may have attached a domain in the meantime.
 */
1425 /* domain is initialized */
1426 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1428 struct dmar_domain *domain, *found = NULL;
1429 struct intel_iommu *iommu;
1430 struct dmar_drhd_unit *drhd;
1431 struct device_domain_info *info, *tmp;
1432 struct pci_dev *dev_tmp;
1433 unsigned long flags;
1434 int bus = 0, devfn = 0;
/* an already attached domain is looked up first */
1436 domain = find_domain(pdev);
1440 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
/* a PCIe bridge is keyed by its subordinate bus number */
1442 if (dev_tmp->is_pcie) {
1443 bus = dev_tmp->subordinate->number;
/* the other case uses the bridge's own bus number and devfn */
1446 bus = dev_tmp->bus->number;
1447 devfn = dev_tmp->devfn;
/* search the global list for an entry registered under that key */
1449 spin_lock_irqsave(&device_domain_lock, flags);
1450 list_for_each_entry(info, &device_domain_list, global) {
1451 if (info->bus == bus && info->devfn == devfn) {
1452 found = info->domain;
1456 spin_unlock_irqrestore(&device_domain_lock, flags);
1457 /* pcie-pci bridge already has a domain, uses it */
1464 /* Allocate new domain for the device */
1465 drhd = dmar_find_matched_drhd_unit(pdev);
1467 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1471 iommu = drhd->iommu;
1473 domain = iommu_alloc_domain(iommu);
/* a failed domain_init() tears the new domain down again */
1477 if (domain_init(domain, gaw)) {
1478 domain_exit(domain);
1482 /* register pcie-to-pci device */
1484 info = alloc_devinfo_mem();
/* presumably the alloc_devinfo_mem() failure path */
1486 domain_exit(domain);
1490 info->devfn = devfn;
1492 info->domain = domain;
1493 /* This domain is shared by devices under p2p bridge */
1494 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1496 /* pcie-to-pci bridge already has a domain, uses it */
/* second scan, this time with the new info ready to be inserted */
1498 spin_lock_irqsave(&device_domain_lock, flags);
1499 list_for_each_entry(tmp, &device_domain_list, global) {
1500 if (tmp->bus == bus && tmp->devfn == devfn) {
1501 found = tmp->domain;
/* presumably taken when the scan found an entry: drop our info and domain */
1506 free_devinfo_mem(info);
1507 domain_exit(domain);
/* publish the bridge's info on the domain list and the global list */
1510 list_add(&info->link, &domain->devices);
1511 list_add(&info->global, &device_domain_list);
1513 spin_unlock_irqrestore(&device_domain_lock, flags);
/* info describing @pdev itself */
1517 info = alloc_devinfo_mem();
1520 info->bus = pdev->bus->number;
1521 info->devfn = pdev->devfn;
1523 info->domain = domain;
1524 spin_lock_irqsave(&device_domain_lock, flags);
1525 /* somebody is fast */
1526 found = find_domain(pdev);
1527 if (found != NULL) {
1528 spin_unlock_irqrestore(&device_domain_lock, flags);
/* a different domain got attached first: ours is discarded */
1529 if (found != domain) {
1530 domain_exit(domain);
1533 free_devinfo_mem(info);
/* link the info in and make it reachable from the device */
1536 list_add(&info->link, &domain->devices);
1537 list_add(&info->global, &device_domain_list);
1538 pdev->dev.archdata.iommu = info;
1539 spin_unlock_irqrestore(&device_domain_lock, flags);
1542 /* recheck it here, maybe others set it */
1543 return find_domain(pdev);
/*
 * Give @pdev a 1:1 mapping of the range described by @start and @end.
 * Steps visible here: obtain the device's domain at the default address
 * width, reserve the page-aligned range in the domain's iova allocator,
 * clear any PTEs already covering it, map base -> base with read and write
 * permission, then program the context entry via domain_context_mapping().
 * The trailing domain_exit() appears to be the error path.
 */
1546 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1548 struct dmar_domain *domain;
1554 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1555 pci_name(pdev), start, end);
1556 /* page table init */
1557 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1561 /* The address might not be aligned */
1562 base = start & PAGE_MASK_4K;
1564 size = PAGE_ALIGN_4K(size);
/* keep the allocator from handing out iovas inside the identity range */
1565 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1566 IOVA_PFN(base + size) - 1)) {
1567 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1572 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1573 size, base, pci_name(pdev));
1575 * RMRR range might have overlap with physical memory range,
1578 dma_pte_clear_range(domain, base, base + size);
/* iova and physical address are both 'base': identity mapping */
1580 ret = domain_page_mapping(domain, base, base, size,
1581 DMA_PTE_READ|DMA_PTE_WRITE);
1585 /* context entry init */
1586 ret = domain_context_mapping(domain, pdev);
1590 domain_exit(domain);
/*
 * Identity-map the reserved memory region @rmrr for @pdev. Devices tagged
 * with DUMMY_DEVICE_DOMAIN_INFO are tested for first. The region's inclusive
 * end_address is turned into an exclusive end by adding 1.
 */
1595 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1596 struct pci_dev *pdev)
1598 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1600 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1601 rmrr->end_address + 1);
1604 #ifdef CONFIG_DMAR_GFX_WA
1605 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
/*
 * Graphics workaround (CONFIG_DMAR_GFX_WA): iterate over all PCI devices,
 * test each for the DUMMY_DEVICE_DOMAIN_INFO tag and for being a
 * display-class device, and set up 1-1 mappings for graphics devices over
 * the RAM ranges enumerated by arch_get_ram_range().
 */
1606 static void __init iommu_prepare_gfx_mapping(void)
1608 struct pci_dev *pdev = NULL;
1613 for_each_pci_dev(pdev) {
/* untranslated devices and non-graphics devices are filtered here */
1614 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1615 !IS_GFX_DEVICE(pdev))
1617 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
/* start the RAM range enumeration at slot 0 */
1619 slot = arch_get_ram_range(0, &base, &size);
1621 ret = iommu_prepare_identity_map(pdev,
/* continue the enumeration from the slot returned last time */
1625 slot = arch_get_ram_range(slot, &base, &size);
1629 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1634 #ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * Floppy workaround (CONFIG_DMAR_FLOPPY_WA): look up a device of class
 * PCI_CLASS_BRIDGE_ISA and give it an identity mapping of the first 16MB.
 * Per the failure message, the floppy may not work without this mapping.
 */
1635 static inline void iommu_prepare_isa(void)
1637 struct pci_dev *pdev;
/* search from the start of the device list (second argument NULL) */
1640 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1644 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1645 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
/* the range requested above is 0-16M, so the failure report names that range */
1648 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1649 "floppy might not work\n");
/*
 * Alternative definition of iommu_prepare_isa(); presumably the one used
 * when CONFIG_DMAR_FLOPPY_WA is not set (see the #endif that follows).
 */
1653 static inline void iommu_prepare_isa(void)
1657 #endif /* !CONFIG_DMAR_FLOPPY_WA */
/*
 * Bring up the DMA remapping hardware. The visible steps, in order:
 *  - for each DRHD unit: alloc_iommu() and iommu_alloc_root_entry();
 *  - for each RMRR unit: iommu_prepare_rmrr_dev() on every listed device;
 *  - iommu_prepare_gfx_mapping() and iommu_prepare_isa();
 *  - for each DRHD unit: name the iommu "dmar<N>", flush the write buffer,
 *    set up its interrupt, install the root entry, flush the context cache
 *    and IOTLB globally, disable protected memory regions and enable
 *    translation.
 * The last loop over the DRHD units is presumably the error unwind path.
 */
1659 int __init init_dmars(void)
1661 struct dmar_drhd_unit *drhd;
1662 struct dmar_rmrr_unit *rmrr;
1663 struct pci_dev *pdev;
1664 struct intel_iommu *iommu;
1670 * initialize and program root entry to not present
1673 for_each_drhd_unit(drhd) {
1676 iommu = alloc_iommu(drhd);
1684 * we could share the same root & context tables
1685 * amoung all IOMMU's. Need to Split it later.
1687 ret = iommu_alloc_root_entry(iommu);
1689 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1696 * for each dev attached to rmrr
1698 * locate drhd for dev, alloc domain for dev
1699 * allocate free domain
1700 * allocate page table entries for rmrr
1701 * if context not allocated for bus
1702 * allocate and init context
1703 * set present in root table for this bus
1704 * init context with domain, translation etc
1708 for_each_rmrr_units(rmrr) {
1710 for (i = 0; i < rmrr->devices_cnt; i++) {
1711 pdev = rmrr->devices[i];
1712 /* some BIOS lists non-exist devices in DMAR table */
1715 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1718 "IOMMU: mapping reserved region failed\n");
/* extra identity mappings for graphics devices and the ISA bridge */
1722 iommu_prepare_gfx_mapping();
1724 iommu_prepare_isa();
1729 * global invalidate context cache
1730 * global invalidate iotlb
1731 * enable translation
1733 for_each_drhd_unit(drhd) {
1736 iommu = drhd->iommu;
/* units are numbered in iteration order */
1737 sprintf (iommu->name, "dmar%d", unit++);
1739 iommu_flush_write_buffer(iommu);
1741 ret = dmar_set_interrupt(iommu);
1745 iommu_set_root_entry(iommu);
1747 iommu_flush_context_global(iommu, 0);
1748 iommu_flush_iotlb_global(iommu, 0);
1750 iommu_disable_protect_mem_regions(iommu);
1752 ret = iommu_enable_translation(iommu);
1759 for_each_drhd_unit(drhd) {
1762 iommu = drhd->iommu;
/*
 * Number of bytes, in whole 4K pages, needed to cover @size bytes starting
 * at @host_addr: the offset of @host_addr inside its 4K page plus @size,
 * rounded up to a multiple of 4K.
 */
1768 static inline u64 aligned_size(u64 host_addr, size_t size)
/* ~PAGE_MASK_4K keeps only the offset within the page */
1771 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1772 return PAGE_ALIGN_4K(addr);
/*
 * Allocate an I/O virtual address range of @size bytes from @domain's
 * allocator, with @end as the upper address limit. @end is first clamped to
 * the highest address the domain's guest address width allows. A zero
 * @size, or one that does not fit between IOVA_START_ADDR and @end, is
 * caught before the allocator is called.
 */
1776 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1780 /* Make sure it's in range */
1781 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1782 if (!size || (IOVA_START_ADDR + size > end))
/* the allocator works in 4K page frames, so size and limit are converted */
1785 piova = alloc_iova(&domain->iovad,
1786 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
/*
 * Pick an iova range of @size bytes for @dev in @domain. When the device's
 * dma_mask is at most 32 bits, or dmar_forcedac is set, the range is
 * allocated directly against dma_mask. Otherwise an allocation below
 * DMA_32BIT_MASK is attempted first, with a second attempt against the full
 * dma_mask. A failure is reported with printk.
 */
1790 static struct iova *
1791 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1794 struct pci_dev *pdev = to_pci_dev(dev);
1795 struct iova *iova = NULL;
1797 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1798 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1801 * First try to allocate an io virtual address in
1802 * DMA_32BIT_MASK and if that fails then try allocating
1805 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1807 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1811 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
/*
 * Obtain a usable domain for @pdev: get_domain_for_dev() is called with the
 * default address width, and if domain_context_mapped() reports that the
 * device's context entries are not in place yet they are set up with
 * domain_context_mapping(). Failures of either step are logged.
 */
1818 static struct dmar_domain *
1819 get_valid_domain_for_dev(struct pci_dev *pdev)
1821 struct dmar_domain *domain;
1824 domain = get_domain_for_dev(pdev,
1825 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1828 "Allocating domain for %s failed", pci_name(pdev));
1832 /* make sure context mapping is ok */
1833 if (unlikely(!domain_context_mapped(domain, pdev))) {
1834 ret = domain_context_mapping(domain, pdev);
1837 "Domain context map for %s failed",
/*
 * Map @size bytes at kernel virtual address @addr for DMA by @hwdev in
 * direction @dir, and return the address the device should use. Devices
 * tagged DUMMY_DEVICE_DOMAIN_INFO are not translated and get
 * virt_to_bus(@addr). For all others an iova range is allocated, the
 * enclosing 4K pages are mapped with permissions derived from @dir, and the
 * IOTLB is flushed for the new range.
 */
1846 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1847 size_t size, int dir)
1849 struct pci_dev *pdev = to_pci_dev(hwdev);
1851 struct dmar_domain *domain;
1852 unsigned long start_addr;
1856 BUG_ON(dir == DMA_NONE);
1857 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1858 return virt_to_bus(addr);
1860 domain = get_valid_domain_for_dev(pdev);
/* from here on addr holds the physical address and size covers whole 4K pages */
1864 addr = (void *)virt_to_phys(addr);
1865 size = aligned_size((u64)addr, size);
1867 iova = __intel_alloc_iova(hwdev, domain, size);
1871 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1874 * Check if DMAR supports zero-length reads on write only
/* read permission: transfers to the device, or no zero-length-read capability */
1877 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1878 !cap_zlr(domain->iommu->cap))
1879 prot |= DMA_PTE_READ;
/* write permission: transfers from the device */
1880 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1881 prot |= DMA_PTE_WRITE;
1883 * addr - (addr + size) might be partial page, we should map the whole
1884 * page. Note: if two part of one page are separately mapped, we
1885 * might have two guest_addr mapping to the same host addr, but this
1886 * is not a big problem
/* the physical address is rounded down to its page start for the mapping */
1888 ret = domain_page_mapping(domain, start_addr,
1889 ((u64)addr) & PAGE_MASK_4K, size, prot);
1893 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1894 pci_name(pdev), size, (u64)addr,
1895 size, (u64)start_addr, dir);
1897 /* it's a non-present to present mapping */
1898 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1899 start_addr, size >> PAGE_SHIFT_4K, 1);
1901 iommu_flush_write_buffer(domain->iommu);
/* result = start of the iova range + offset of addr within its 4K page */
1903 return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
/* failure handling: give the iova back and log the request */
1907 __free_iova(&domain->iovad, iova);
1908 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1909 pci_name(pdev), size, (u64)addr, dir);
/*
 * Undo a mapping made for @dev at device address @dev_addr of @size bytes.
 * The iova range containing @dev_addr is looked up, its PTEs are cleared
 * and its page tables freed, the IOTLB is flushed for the range, and the
 * iova is returned to the allocator. Devices tagged
 * DUMMY_DEVICE_DOMAIN_INFO are tested for first.
 */
1913 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1914 size_t size, int dir)
1916 struct pci_dev *pdev = to_pci_dev(dev);
1917 struct dmar_domain *domain;
1918 unsigned long start_addr;
1921 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1923 domain = find_domain(pdev);
1926 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
/* unmap from the start of the iova range, covering whole 4K pages */
1930 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1931 size = aligned_size((u64)dev_addr, size);
1933 pr_debug("Device %s unmapping: %lx@%llx\n",
1934 pci_name(pdev), size, (u64)start_addr);
1936 /* clear the whole page */
1937 dma_pte_clear_range(domain, start_addr, start_addr + size);
1938 /* free page tables */
1939 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
/* a non-zero result from the page-selective flush triggers a write-buffer flush */
1941 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1942 size >> PAGE_SHIFT_4K, 0))
1943 iommu_flush_write_buffer(domain->iommu);
1946 __free_iova(&domain->iovad, iova);
/*
 * Allocate a zeroed buffer of at least @size bytes for @hwdev and map it
 * for bidirectional DMA. The device address is stored in *@dma_handle.
 * GFP_DMA and GFP_DMA32 are removed from @flags before the pages are
 * allocated. The free_pages() call at the end is presumably the path taken
 * when the mapping fails.
 */
1949 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1950 dma_addr_t *dma_handle, gfp_t flags)
/* whole 4K pages, expressed as an allocation order */
1955 size = PAGE_ALIGN_4K(size);
1956 order = get_order(size);
1957 flags &= ~(GFP_DMA | GFP_DMA32);
1959 vaddr = (void *)__get_free_pages(flags, order);
1962 memset(vaddr, 0, size);
1964 *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1967 free_pages((unsigned long)vaddr, order);
/*
 * Release a buffer obtained from intel_alloc_coherent(): remove the
 * bidirectional mapping at @dma_handle, then free the pages at @vaddr.
 * The size is rounded and converted to an order the same way as on
 * allocation.
 */
1971 static void intel_free_coherent(struct device *hwdev, size_t size,
1972 void *vaddr, dma_addr_t dma_handle)
1976 size = PAGE_ALIGN_4K(size);
1977 order = get_order(size);
1979 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1980 free_pages((unsigned long)vaddr, order);
/* Kernel virtual address of a scatterlist entry's buffer (wraps sg_virt()). */
1983 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
/*
 * Undo a scatter-gather mapping of @nelems entries made for @hwdev. The
 * iova range is located from the first entry's dma_address and the total
 * length is the sum of the page-aligned sizes of all entries. The range's
 * PTEs are then cleared, its page tables freed, the IOTLB flushed and the
 * iova released. Devices tagged DUMMY_DEVICE_DOMAIN_INFO are tested for
 * first.
 */
1984 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1985 int nelems, int dir)
1988 struct pci_dev *pdev = to_pci_dev(hwdev);
1989 struct dmar_domain *domain;
1990 unsigned long start_addr;
1994 struct scatterlist *sg;
1996 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1999 domain = find_domain(pdev);
2001 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
/* accumulate the page-aligned size of every entry */
2004 for_each_sg(sglist, sg, nelems, i) {
2005 addr = SG_ENT_VIRT_ADDRESS(sg);
2006 size += aligned_size((u64)addr, sg->length);
2009 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2011 /* clear the whole page */
2012 dma_pte_clear_range(domain, start_addr, start_addr + size);
2013 /* free page tables */
2014 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
/* a non-zero result from the page-selective flush triggers a write-buffer flush */
2016 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2017 size >> PAGE_SHIFT_4K, 0))
2018 iommu_flush_write_buffer(domain->iommu);
2021 __free_iova(&domain->iovad, iova);
/*
 * Scatter-gather "mapping" without translation: each entry's dma_address is
 * set to the bus address of its buffer (virt_to_bus()) and its dma_length
 * to its length. Every entry must have a page (BUG_ON otherwise).
 */
2024 static int intel_nontranslate_map_sg(struct device *hddev,
2025 struct scatterlist *sglist, int nelems, int dir)
2028 struct scatterlist *sg;
2030 for_each_sg(sglist, sg, nelems, i) {
2031 BUG_ON(!sg_page(sg));
2032 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2033 sg->dma_length = sg->length;
/*
 * Map a scatterlist of @nelems entries for DMA by @hwdev in direction @dir.
 * Devices tagged DUMMY_DEVICE_DOMAIN_INFO are handed to
 * intel_nontranslate_map_sg(). Otherwise one iova range large enough for the
 * page-aligned sizes of all entries is allocated, and each entry is mapped
 * at a running offset inside that range. If mapping an entry fails, the part
 * mapped so far is cleared and the iova released. The IOTLB is flushed for
 * the new range at the end.
 */
2038 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2039 int nelems, int dir)
2043 struct pci_dev *pdev = to_pci_dev(hwdev);
2044 struct dmar_domain *domain;
2048 struct iova *iova = NULL;
2050 struct scatterlist *sg;
2051 unsigned long start_addr;
2053 BUG_ON(dir == DMA_NONE);
2054 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2055 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2057 domain = get_valid_domain_for_dev(pdev);
/* first pass: total page-aligned size of all entries */
2061 for_each_sg(sglist, sg, nelems, i) {
2062 addr = SG_ENT_VIRT_ADDRESS(sg);
2063 addr = (void *)virt_to_phys(addr);
2064 size += aligned_size((u64)addr, sg->length);
2067 iova = __intel_alloc_iova(hwdev, domain, size);
/* presumably the iova allocation failure path */
2069 sglist->dma_length = 0;
2074 * Check if DMAR supports zero-length reads on write only
/* read permission: transfers to the device, or no zero-length-read capability */
2077 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2078 !cap_zlr(domain->iommu->cap))
2079 prot |= DMA_PTE_READ;
/* write permission: transfers from the device */
2080 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2081 prot |= DMA_PTE_WRITE;
2083 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
/* second pass: map each entry at start_addr + offset */
2085 for_each_sg(sglist, sg, nelems, i) {
2086 addr = SG_ENT_VIRT_ADDRESS(sg);
2087 addr = (void *)virt_to_phys(addr);
2088 size = aligned_size((u64)addr, sg->length);
2089 ret = domain_page_mapping(domain, start_addr + offset,
2090 ((u64)addr) & PAGE_MASK_4K,
/* undo only what was mapped so far: [start_addr, start_addr + offset) */
2093 /* clear the page */
2094 dma_pte_clear_range(domain, start_addr,
2095 start_addr + offset);
2096 /* free page tables */
2097 dma_pte_free_pagetable(domain, start_addr,
2098 start_addr + offset);
2100 __free_iova(&domain->iovad, iova);
/* device address = entry's slot in the range + offset within its 4K page */
2103 sg->dma_address = start_addr + offset +
2104 ((u64)addr & (~PAGE_MASK_4K));
2105 sg->dma_length = sg->length;
2109 /* it's a non-present to present mapping */
2110 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2111 start_addr, offset >> PAGE_SHIFT_4K, 1))
2112 iommu_flush_write_buffer(domain->iommu);
/*
 * DMA mapping operations implemented by this file's intel_* functions;
 * intel_iommu_init() points dma_ops at this table.
 */
2116 static struct dma_mapping_ops intel_dma_ops = {
2117 .alloc_coherent = intel_alloc_coherent,
2118 .free_coherent = intel_free_coherent,
2119 .map_single = intel_map_single,
2120 .unmap_single = intel_unmap_single,
2121 .map_sg = intel_map_sg,
2122 .unmap_sg = intel_unmap_sg,
/*
 * Create the "iommu_domain" slab cache, sized for struct dmar_domain, and
 * log an error if it cannot be created.
 */
2125 static inline int iommu_domain_cache_init(void)
2129 iommu_domain_cache = kmem_cache_create("iommu_domain",
2130 sizeof(struct dmar_domain),
2135 if (!iommu_domain_cache) {
2136 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
/*
 * Create the "iommu_devinfo" slab cache, sized for struct
 * device_domain_info, and log an error if it cannot be created.
 */
2143 static inline int iommu_devinfo_cache_init(void)
2147 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2148 sizeof(struct device_domain_info),
2153 if (!iommu_devinfo_cache) {
2154 printk(KERN_ERR "Couldn't create devinfo cache\n");
/*
 * Create the "iommu_iova" slab cache, sized for struct iova, and log an
 * error if it cannot be created.
 */
2161 static inline int iommu_iova_cache_init(void)
2165 iommu_iova_cache = kmem_cache_create("iommu_iova",
2166 sizeof(struct iova),
2171 if (!iommu_iova_cache) {
2172 printk(KERN_ERR "Couldn't create iova cache\n");
/*
 * Create the three slab caches used by this file, in the order iova,
 * domain, devinfo. The kmem_cache_destroy() calls at the end release the
 * domain and iova caches; presumably they unwind a partial initialisation.
 */
2179 static int __init iommu_init_mempool(void)
2182 ret = iommu_iova_cache_init();
2186 ret = iommu_domain_cache_init();
2190 ret = iommu_devinfo_cache_init();
2194 kmem_cache_destroy(iommu_domain_cache);
2196 kmem_cache_destroy(iommu_iova_cache);
/* Destroy the devinfo, domain and iova slab caches, in that order. */
2201 static void __init iommu_exit_mempool(void)
2203 kmem_cache_destroy(iommu_devinfo_cache);
2204 kmem_cache_destroy(iommu_domain_cache);
2205 kmem_cache_destroy(iommu_iova_cache);
/*
 * Early detection hook. The first test covers swiotlb, no_iommu,
 * iommu_detected and dmar_disabled (presumably an early exit when any is
 * set); after it, early_dmar_detect() is consulted.
 */
2209 void __init detect_intel_iommu(void)
2211 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2213 if (early_dmar_detect()) {
/*
 * Work out which DMAR units and devices are left out of remapping.
 * First loop: for a unit that is not include_all, devices[] is scanned for
 * a non-NULL entry; a unit with none is ignored (per the existing comment).
 * Second loop: ignored and include_all units are filtered out, devices[] is
 * scanned for an entry that is present and not a graphics device, and for
 * a unit serving only graphics devices every present device is tagged with
 * DUMMY_DEVICE_DOMAIN_INFO.
 */
2218 static void __init init_no_remapping_devices(void)
2220 struct dmar_drhd_unit *drhd;
2222 for_each_drhd_unit(drhd) {
2223 if (!drhd->include_all) {
/* i stops short of devices_cnt if some listed device exists */
2225 for (i = 0; i < drhd->devices_cnt; i++)
2226 if (drhd->devices[i] != NULL)
2228 /* ignore DMAR unit if no pci devices exist */
2229 if (i == drhd->devices_cnt)
2237 for_each_drhd_unit(drhd) {
2239 if (drhd->ignored || drhd->include_all)
/* look for any present device that is not a graphics device */
2242 for (i = 0; i < drhd->devices_cnt; i++)
2243 if (drhd->devices[i] &&
2244 !IS_GFX_DEVICE(drhd->devices[i]))
2247 if (i < drhd->devices_cnt)
2250 /* bypass IOMMU if it is just for gfx devices */
2252 for (i = 0; i < drhd->devices_cnt; i++) {
2253 if (!drhd->devices[i])
2255 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2260 int __init intel_iommu_init(void)
2264 if (no_iommu || swiotlb || dmar_disabled)
2267 if (dmar_table_init())
2270 iommu_init_mempool();
2271 dmar_init_reserved_ranges();
2273 init_no_remapping_devices();
2277 printk(KERN_ERR "IOMMU: dmar init failed\n");
2278 put_iova_domain(&reserved_iova_list);
2279 iommu_exit_mempool();
2283 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2286 dma_ops = &intel_dma_ops;