2 * Copyright (c) 2006, Intel Corporation.
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
13 * You should have received a copy of the GNU General Public License along with
14 * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15 * Place - Suite 330, Boston, MA 02111-1307 USA.
17 * Copyright (C) Ashok Raj <ashok.raj@intel.com>
18 * Copyright (C) Shaohua Li <shaohua.li@intel.com>
19 * Copyright (C) Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
22 #include <linux/init.h>
23 #include <linux/bitmap.h>
24 #include <linux/slab.h>
25 #include <linux/irq.h>
26 #include <linux/interrupt.h>
27 #include <linux/sysdev.h>
28 #include <linux/spinlock.h>
29 #include <linux/pci.h>
30 #include <linux/dmar.h>
31 #include <linux/dma-mapping.h>
32 #include <linux/mempool.h>
34 #include "intel-iommu.h"
35 #include <asm/proto.h> /* force_iommu in this header in x86-64 */
36 #include <asm/cacheflush.h>
37 #include <asm/iommu.h>
40 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
41 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
43 #define IOAPIC_RANGE_START (0xfee00000)
44 #define IOAPIC_RANGE_END (0xfeefffff)
45 #define IOVA_START_ADDR (0x1000)
47 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
49 #define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1 minute */
51 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
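/*
 * Worked example (editor's note): with the default 48-bit domain width
 * above, DOMAIN_MAX_ADDR(48) = ((u64)1 << 48) - 1 = 0xffffffffffff,
 * i.e. the highest DMA address a domain can translate.
 */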
53 static void domain_remove_dev_info(struct dmar_domain *domain);
55 static int dmar_disabled;
56 static int __initdata dmar_map_gfx = 1;
57 static int dmar_forcedac;
59 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
60 static DEFINE_SPINLOCK(device_domain_lock);
61 static LIST_HEAD(device_domain_list);
63 static int __init intel_iommu_setup(char *str)
68 if (!strncmp(str, "off", 3)) {
70 printk(KERN_INFO"Intel-IOMMU: disabled\n");
71 } else if (!strncmp(str, "igfx_off", 8)) {
74 "Intel-IOMMU: disable GFX device mapping\n");
75 } else if (!strncmp(str, "forcedac", 8)) {
77 "Intel-IOMMU: Forcing DAC for PCI devices\n");
81 str += strcspn(str, ",");
87 __setup("intel_iommu=", intel_iommu_setup);
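/*
 * Example usage (editor's note, based on the option strings parsed
 * above), on the kernel boot command line:
 *
 *	intel_iommu=off		disable DMAR translation entirely
 *	intel_iommu=igfx_off	leave graphics devices unmapped
 *	intel_iommu=forcedac	do not prefer 32-bit DMA addresses (force DAC)
 *
 * Multiple options can be combined, separated by commas, e.g.
 * "intel_iommu=igfx_off,forcedac".
 */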
89 static struct kmem_cache *iommu_domain_cache;
90 static struct kmem_cache *iommu_devinfo_cache;
91 static struct kmem_cache *iommu_iova_cache;
93 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
98 /* trying to avoid low memory issues */
99 flags = current->flags & PF_MEMALLOC;
100 current->flags |= PF_MEMALLOC;
101 vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
102 current->flags &= (~PF_MEMALLOC | flags); /* restore original PF_MEMALLOC state */
107 static inline void *alloc_pgtable_page(void)
112 /* trying to avoid low memory issues */
113 flags = current->flags & PF_MEMALLOC;
114 current->flags |= PF_MEMALLOC;
115 vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
116 current->flags &= (~PF_MEMALLOC | flags); /* restore original PF_MEMALLOC state */
120 static inline void free_pgtable_page(void *vaddr)
122 free_page((unsigned long)vaddr);
125 static inline void *alloc_domain_mem(void)
127 return iommu_kmem_cache_alloc(iommu_domain_cache);
130 static inline void free_domain_mem(void *vaddr)
132 kmem_cache_free(iommu_domain_cache, vaddr);
135 static inline void * alloc_devinfo_mem(void)
137 return iommu_kmem_cache_alloc(iommu_devinfo_cache);
140 static inline void free_devinfo_mem(void *vaddr)
142 kmem_cache_free(iommu_devinfo_cache, vaddr);
145 struct iova *alloc_iova_mem(void)
147 return iommu_kmem_cache_alloc(iommu_iova_cache);
150 void free_iova_mem(struct iova *iova)
152 kmem_cache_free(iommu_iova_cache, iova);
155 static inline void __iommu_flush_cache(
156 struct intel_iommu *iommu, void *addr, int size)
158 if (!ecap_coherent(iommu->ecap))
159 clflush_cache_range(addr, size);
162 /* Gets context entry for a given bus and devfn */
163 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
166 struct root_entry *root;
167 struct context_entry *context;
168 unsigned long phy_addr;
171 spin_lock_irqsave(&iommu->lock, flags);
172 root = &iommu->root_entry[bus];
173 context = get_context_addr_from_root(root);
175 context = (struct context_entry *)alloc_pgtable_page();
177 spin_unlock_irqrestore(&iommu->lock, flags);
180 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
181 phy_addr = virt_to_phys((void *)context);
182 set_root_value(root, phy_addr);
183 set_root_present(root);
184 __iommu_flush_cache(iommu, root, sizeof(*root));
186 spin_unlock_irqrestore(&iommu->lock, flags);
187 return &context[devfn];
190 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
192 struct root_entry *root;
193 struct context_entry *context;
197 spin_lock_irqsave(&iommu->lock, flags);
198 root = &iommu->root_entry[bus];
199 context = get_context_addr_from_root(root);
204 ret = context_present(context[devfn]);
206 spin_unlock_irqrestore(&iommu->lock, flags);
210 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
212 struct root_entry *root;
213 struct context_entry *context;
216 spin_lock_irqsave(&iommu->lock, flags);
217 root = &iommu->root_entry[bus];
218 context = get_context_addr_from_root(root);
220 context_clear_entry(context[devfn]);
221 __iommu_flush_cache(iommu, &context[devfn], sizeof(*context));
224 spin_unlock_irqrestore(&iommu->lock, flags);
227 static void free_context_table(struct intel_iommu *iommu)
229 struct root_entry *root;
232 struct context_entry *context;
234 spin_lock_irqsave(&iommu->lock, flags);
235 if (!iommu->root_entry) {
238 for (i = 0; i < ROOT_ENTRY_NR; i++) {
239 root = &iommu->root_entry[i];
240 context = get_context_addr_from_root(root);
242 free_pgtable_page(context);
244 free_pgtable_page(iommu->root_entry);
245 iommu->root_entry = NULL;
247 spin_unlock_irqrestore(&iommu->lock, flags);
250 /* page table handling */
251 #define LEVEL_STRIDE (9)
252 #define LEVEL_MASK (((u64)1 << LEVEL_STRIDE) - 1)
254 static inline int agaw_to_level(int agaw)
259 static inline int agaw_to_width(int agaw)
261 return 30 + agaw * LEVEL_STRIDE;
265 static inline int width_to_agaw(int width)
267 return (width - 30) / LEVEL_STRIDE;
270 static inline unsigned int level_to_offset_bits(int level)
272 return (12 + (level - 1) * LEVEL_STRIDE);
275 static inline int address_level_offset(u64 addr, int level)
277 return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
280 static inline u64 level_mask(int level)
282 return ((u64)-1 << level_to_offset_bits(level));
285 static inline u64 level_size(int level)
287 return ((u64)1 << level_to_offset_bits(level));
290 static inline u64 align_to_level(u64 addr, int level)
292 return ((addr + level_size(level) - 1) & level_mask(level));
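/*
 * Worked example (editor's note): DEFAULT_DOMAIN_ADDRESS_WIDTH is 48,
 * so width_to_agaw(48) = (48 - 30) / 9 = 2 and agaw_to_width(2) = 48.
 * Each page-table level indexes LEVEL_STRIDE = 9 address bits:
 *
 *	level 1: bits 20:12	(level_to_offset_bits(1) = 12)
 *	level 2: bits 29:21
 *	level 3: bits 38:30
 *	level 4: bits 47:39
 *
 * A 48-bit domain therefore needs a four-level table (assuming the
 * elided agaw_to_level() returns agaw + 2).
 */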
295 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
297 int addr_width = agaw_to_width(domain->agaw);
298 struct dma_pte *parent, *pte = NULL;
299 int level = agaw_to_level(domain->agaw);
303 BUG_ON(!domain->pgd);
305 addr &= (((u64)1) << addr_width) - 1;
306 parent = domain->pgd;
308 spin_lock_irqsave(&domain->mapping_lock, flags);
312 offset = address_level_offset(addr, level);
313 pte = &parent[offset];
317 if (!dma_pte_present(*pte)) {
318 tmp_page = alloc_pgtable_page();
321 spin_unlock_irqrestore(&domain->mapping_lock,
325 __iommu_flush_cache(domain->iommu, tmp_page,
327 dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
329 * higher-level entries always set r/w; the last-level page
330 * table entries control the actual read/write permission
332 dma_set_pte_readable(*pte);
333 dma_set_pte_writable(*pte);
334 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
336 parent = phys_to_virt(dma_pte_addr(*pte));
340 spin_unlock_irqrestore(&domain->mapping_lock, flags);
344 /* return the address's pte at a specific level */
345 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
348 struct dma_pte *parent, *pte = NULL;
349 int total = agaw_to_level(domain->agaw);
352 parent = domain->pgd;
353 while (level <= total) {
354 offset = address_level_offset(addr, total);
355 pte = &parent[offset];
359 if (!dma_pte_present(*pte))
361 parent = phys_to_virt(dma_pte_addr(*pte));
367 /* clear one page's page table */
368 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
370 struct dma_pte *pte = NULL;
372 /* get last level pte */
373 pte = dma_addr_level_pte(domain, addr, 1);
377 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
381 /* clear last-level ptes; a TLB flush should follow */
382 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
384 int addr_width = agaw_to_width(domain->agaw);
386 start &= (((u64)1) << addr_width) - 1;
387 end &= (((u64)1) << addr_width) - 1;
388 /* in case it's a partial page */
389 start = PAGE_ALIGN_4K(start);
392 /* we don't need lock here, nobody else touches the iova range */
393 while (start < end) {
394 dma_pte_clear_one(domain, start);
395 start += PAGE_SIZE_4K;
399 /* free page table pages. last level pte should already be cleared */
400 static void dma_pte_free_pagetable(struct dmar_domain *domain,
403 int addr_width = agaw_to_width(domain->agaw);
405 int total = agaw_to_level(domain->agaw);
409 start &= (((u64)1) << addr_width) - 1;
410 end &= (((u64)1) << addr_width) - 1;
412 /* we don't need lock here, nobody else touches the iova range */
414 while (level <= total) {
415 tmp = align_to_level(start, level);
416 if (tmp >= end || (tmp + level_size(level) > end))
420 pte = dma_addr_level_pte(domain, tmp, level);
423 phys_to_virt(dma_pte_addr(*pte)));
425 __iommu_flush_cache(domain->iommu,
428 tmp += level_size(level);
433 if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
434 free_pgtable_page(domain->pgd);
440 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
442 struct root_entry *root;
445 root = (struct root_entry *)alloc_pgtable_page();
449 __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
451 spin_lock_irqsave(&iommu->lock, flags);
452 iommu->root_entry = root;
453 spin_unlock_irqrestore(&iommu->lock, flags);
458 #define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
460 unsigned long start_time = jiffies;\
462 sts = op (iommu->reg + offset);\
465 if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
466 panic("DMAR hardware is malfunctioning\n");\
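/*
 * Usage sketch (editor's note): the macro (its loop body is partly
 * elided above) polls "op(iommu->reg + offset)" until "cond" becomes
 * true, panicking after DMAR_OPERATION_TIMEOUT.  A typical call, as
 * used for enabling translation later in this file:
 *
 *	IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
 *		readl, (sts & DMA_GSTS_TES), sts);
 */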
471 static void iommu_set_root_entry(struct intel_iommu *iommu)
477 addr = iommu->root_entry;
479 spin_lock_irqsave(&iommu->register_lock, flag);
480 dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
482 cmd = iommu->gcmd | DMA_GCMD_SRTP;
483 writel(cmd, iommu->reg + DMAR_GCMD_REG);
485 /* Make sure hardware completes it */
486 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
487 readl, (sts & DMA_GSTS_RTPS), sts);
489 spin_unlock_irqrestore(&iommu->register_lock, flag);
492 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
497 if (!cap_rwbf(iommu->cap))
499 val = iommu->gcmd | DMA_GCMD_WBF;
501 spin_lock_irqsave(&iommu->register_lock, flag);
502 writel(val, iommu->reg + DMAR_GCMD_REG);
504 /* Make sure hardware completes it */
505 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
506 readl, (!(val & DMA_GSTS_WBFS)), val);
508 spin_unlock_irqrestore(&iommu->register_lock, flag);
511 /* the return value determines whether we need a write-buffer flush */
512 static int __iommu_flush_context(struct intel_iommu *iommu,
513 u16 did, u16 source_id, u8 function_mask, u64 type,
514 int non_present_entry_flush)
520 * In the non-present entry flush case, if hardware doesn't cache
521 * non-present entries we do nothing; if hardware does cache them,
522 * we flush the entries of domain 0 (that domain id is used to cache
523 * any non-present entries)
525 if (non_present_entry_flush) {
526 if (!cap_caching_mode(iommu->cap))
533 case DMA_CCMD_GLOBAL_INVL:
534 val = DMA_CCMD_GLOBAL_INVL;
536 case DMA_CCMD_DOMAIN_INVL:
537 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
539 case DMA_CCMD_DEVICE_INVL:
540 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
541 | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
548 spin_lock_irqsave(&iommu->register_lock, flag);
549 dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
551 /* Make sure hardware completes it */
552 IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
553 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
555 spin_unlock_irqrestore(&iommu->register_lock, flag);
557 /* flushing a context entry implicitly flushes the write buffer */
561 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
562 int non_present_entry_flush)
564 return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
565 non_present_entry_flush);
568 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
569 int non_present_entry_flush)
571 return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
572 non_present_entry_flush);
575 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
576 u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
578 return __iommu_flush_context(iommu, did, source_id, function_mask,
579 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
582 /* the return value determines whether we need a write-buffer flush */
583 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
584 u64 addr, unsigned int size_order, u64 type,
585 int non_present_entry_flush)
587 int tlb_offset = ecap_iotlb_offset(iommu->ecap);
588 u64 val = 0, val_iva = 0;
592 * In the non-present entry flush case, if hardware doesn't cache
593 * non-present entries we do nothing; if hardware does cache them,
594 * we flush the entries of domain 0 (that domain id is used to cache
595 * any non-present entries)
597 if (non_present_entry_flush) {
598 if (!cap_caching_mode(iommu->cap))
605 case DMA_TLB_GLOBAL_FLUSH:
606 /* a global flush doesn't need to set IVA_REG */
607 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
609 case DMA_TLB_DSI_FLUSH:
610 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
612 case DMA_TLB_PSI_FLUSH:
613 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
614 /* Note: always flush non-leaf currently */
615 val_iva = size_order | addr;
620 /* Note: set drain read/write */
623 * This is probably only here to be extra safe; it looks like we
624 * can ignore it without any impact.
626 if (cap_read_drain(iommu->cap))
627 val |= DMA_TLB_READ_DRAIN;
629 if (cap_write_drain(iommu->cap))
630 val |= DMA_TLB_WRITE_DRAIN;
632 spin_lock_irqsave(&iommu->register_lock, flag);
633 /* Note: Only uses first TLB reg currently */
635 dmar_writeq(iommu->reg + tlb_offset, val_iva);
636 dmar_writeq(iommu->reg + tlb_offset + 8, val);
638 /* Make sure hardware completes it */
639 IOMMU_WAIT_OP(iommu, tlb_offset + 8,
640 dmar_readq, (!(val & DMA_TLB_IVT)), val);
642 spin_unlock_irqrestore(&iommu->register_lock, flag);
644 /* check IOTLB invalidation granularity */
645 if (DMA_TLB_IAIG(val) == 0)
646 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
647 if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
648 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
649 DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
650 /* the IOTLB flush implicitly flushes the write buffer */
654 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
655 int non_present_entry_flush)
657 return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
658 non_present_entry_flush);
661 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
662 int non_present_entry_flush)
664 return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
665 non_present_entry_flush);
668 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
669 u64 addr, unsigned int pages, int non_present_entry_flush)
673 BUG_ON(addr & (~PAGE_MASK_4K));
676 /* Fallback to domain selective flush if no PSI support */
677 if (!cap_pgsel_inv(iommu->cap))
678 return iommu_flush_iotlb_dsi(iommu, did,
679 non_present_entry_flush);
682 * PSI requires the number of pages to be a power of two, and the base
683 * address to be naturally aligned to the size
685 mask = ilog2(__roundup_pow_of_two(pages));
686 /* Fallback to domain selective flush if size is too big */
687 if (mask > cap_max_amask_val(iommu->cap))
688 return iommu_flush_iotlb_dsi(iommu, did,
689 non_present_entry_flush);
691 return __iommu_flush_iotlb(iommu, did, addr, mask,
692 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
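/*
 * Worked example (editor's note): for pages = 8,
 * mask = ilog2(__roundup_pow_of_two(8)) = 3, so the PSI invalidation
 * covers 2^3 = 8 contiguous 4K pages (32K) starting at "addr"; per the
 * comment above, addr is expected to be naturally aligned to that size.
 * For pages = 5 the count is rounded up to 8 as well.
 */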
695 static int iommu_enable_translation(struct intel_iommu *iommu)
700 spin_lock_irqsave(&iommu->register_lock, flags);
701 writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
703 /* Make sure hardware completes it */
704 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
705 readl, (sts & DMA_GSTS_TES), sts);
707 iommu->gcmd |= DMA_GCMD_TE;
708 spin_unlock_irqrestore(&iommu->register_lock, flags);
712 static int iommu_disable_translation(struct intel_iommu *iommu)
717 spin_lock_irqsave(&iommu->register_lock, flag);
718 iommu->gcmd &= ~DMA_GCMD_TE;
719 writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
721 /* Make sure hardware completes it */
722 IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
723 readl, (!(sts & DMA_GSTS_TES)), sts);
725 spin_unlock_irqrestore(&iommu->register_lock, flag);
729 /* iommu interrupt handling. Most of it is MSI-like. */
731 static char *fault_reason_strings[] =
734 "Present bit in root entry is clear",
735 "Present bit in context entry is clear",
736 "Invalid context entry",
737 "Access beyond MGAW",
738 "PTE Write access is not set",
739 "PTE Read access is not set",
740 "Next page table ptr is invalid",
741 "Root table address invalid",
742 "Context table ptr is invalid",
743 "non-zero reserved fields in RTP",
744 "non-zero reserved fields in CTP",
745 "non-zero reserved fields in PTE",
748 #define MAX_FAULT_REASON_IDX (ARRAY_SIZE(fault_reason_strings) - 1)
750 char *dmar_get_fault_reason(u8 fault_reason)
752 if (fault_reason > MAX_FAULT_REASON_IDX)
753 return fault_reason_strings[MAX_FAULT_REASON_IDX];
755 return fault_reason_strings[fault_reason];
758 void dmar_msi_unmask(unsigned int irq)
760 struct intel_iommu *iommu = get_irq_data(irq);
764 spin_lock_irqsave(&iommu->register_lock, flag);
765 writel(0, iommu->reg + DMAR_FECTL_REG);
766 /* Read a reg to force flush the post write */
767 readl(iommu->reg + DMAR_FECTL_REG);
768 spin_unlock_irqrestore(&iommu->register_lock, flag);
771 void dmar_msi_mask(unsigned int irq)
774 struct intel_iommu *iommu = get_irq_data(irq);
777 spin_lock_irqsave(&iommu->register_lock, flag);
778 writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
779 /* Read a reg to force flush the post write */
780 readl(iommu->reg + DMAR_FECTL_REG);
781 spin_unlock_irqrestore(&iommu->register_lock, flag);
784 void dmar_msi_write(int irq, struct msi_msg *msg)
786 struct intel_iommu *iommu = get_irq_data(irq);
789 spin_lock_irqsave(&iommu->register_lock, flag);
790 writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
791 writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
792 writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
793 spin_unlock_irqrestore(&iommu->register_lock, flag);
796 void dmar_msi_read(int irq, struct msi_msg *msg)
798 struct intel_iommu *iommu = get_irq_data(irq);
801 spin_lock_irqsave(&iommu->register_lock, flag);
802 msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
803 msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
804 msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
805 spin_unlock_irqrestore(&iommu->register_lock, flag);
808 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
809 u8 fault_reason, u16 source_id, u64 addr)
813 reason = dmar_get_fault_reason(fault_reason);
816 "DMAR:[%s] Request device [%02x:%02x.%d] "
818 "DMAR:[fault reason %02d] %s\n",
819 (type ? "DMA Read" : "DMA Write"),
820 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
821 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
825 #define PRIMARY_FAULT_REG_LEN (16)
826 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
828 struct intel_iommu *iommu = dev_id;
829 int reg, fault_index;
833 spin_lock_irqsave(&iommu->register_lock, flag);
834 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
836 /* TBD: ignore advanced fault log currently */
837 if (!(fault_status & DMA_FSTS_PPF))
840 fault_index = dma_fsts_fault_record_index(fault_status);
841 reg = cap_fault_reg_offset(iommu->cap);
849 /* highest 32 bits */
850 data = readl(iommu->reg + reg +
851 fault_index * PRIMARY_FAULT_REG_LEN + 12);
852 if (!(data & DMA_FRCD_F))
855 fault_reason = dma_frcd_fault_reason(data);
856 type = dma_frcd_type(data);
858 data = readl(iommu->reg + reg +
859 fault_index * PRIMARY_FAULT_REG_LEN + 8);
860 source_id = dma_frcd_source_id(data);
862 guest_addr = dmar_readq(iommu->reg + reg +
863 fault_index * PRIMARY_FAULT_REG_LEN);
864 guest_addr = dma_frcd_page_addr(guest_addr);
865 /* clear the fault */
866 writel(DMA_FRCD_F, iommu->reg + reg +
867 fault_index * PRIMARY_FAULT_REG_LEN + 12);
869 spin_unlock_irqrestore(&iommu->register_lock, flag);
871 iommu_page_fault_do_one(iommu, type, fault_reason,
872 source_id, guest_addr);
875 if (fault_index > cap_num_fault_regs(iommu->cap))
877 spin_lock_irqsave(&iommu->register_lock, flag);
880 /* clear primary fault overflow */
881 fault_status = readl(iommu->reg + DMAR_FSTS_REG);
882 if (fault_status & DMA_FSTS_PFO)
883 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
885 spin_unlock_irqrestore(&iommu->register_lock, flag);
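/*
 * Editor's note on the fault record layout as read above: each record
 * is PRIMARY_FAULT_REG_LEN (16) bytes at
 * iommu->reg + cap_fault_reg_offset(cap) + fault_index * 16:
 *	bytes  0..7	faulting page address (dma_frcd_page_addr())
 *	bytes  8..11	requester source id (dma_frcd_source_id())
 *	bytes 12..15	F bit, fault reason and request type; writing
 *			DMA_FRCD_F back clears the record
 */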
889 int dmar_set_interrupt(struct intel_iommu *iommu)
895 printk(KERN_ERR "IOMMU: no free vectors\n");
899 set_irq_data(irq, iommu);
902 ret = arch_setup_dmar_msi(irq);
904 set_irq_data(irq, NULL);
910 /* Process and clear any pending faults before requesting the irq */
911 iommu_page_fault(irq, iommu);
913 ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
915 printk(KERN_ERR "IOMMU: can't request irq\n");
919 static int iommu_init_domains(struct intel_iommu *iommu)
921 unsigned long ndomains;
922 unsigned long nlongs;
924 ndomains = cap_ndoms(iommu->cap);
925 pr_debug("Number of Domains supportd <%ld>\n", ndomains);
926 nlongs = BITS_TO_LONGS(ndomains);
928 /* TBD: there might be 64K domains,
929 * consider a different allocation scheme for future chips
931 iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
932 if (!iommu->domain_ids) {
933 printk(KERN_ERR "Allocating domain id array failed\n");
936 iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
938 if (!iommu->domains) {
939 printk(KERN_ERR "Allocating domain array failed\n");
940 kfree(iommu->domain_ids);
945 * if Caching mode is set, then invalid translations are tagged
946 * with domain id 0. Hence we need to pre-allocate it.
948 if (cap_caching_mode(iommu->cap))
949 set_bit(0, iommu->domain_ids);
953 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
955 struct intel_iommu *iommu;
960 iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
963 iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
965 printk(KERN_ERR "IOMMU: can't map the region\n");
968 iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
969 iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
971 /* the registers might be more than one page */
972 map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
973 cap_max_fault_reg_offset(iommu->cap));
974 map_size = PAGE_ALIGN_4K(map_size);
975 if (map_size > PAGE_SIZE_4K) {
977 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
979 printk(KERN_ERR "IOMMU: can't map the region\n");
984 ver = readl(iommu->reg + DMAR_VER_REG);
985 pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
986 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
987 iommu->cap, iommu->ecap);
988 ret = iommu_init_domains(iommu);
991 spin_lock_init(&iommu->lock);
992 spin_lock_init(&iommu->register_lock);
1004 static void domain_exit(struct dmar_domain *domain);
1005 static void free_iommu(struct intel_iommu *iommu)
1007 struct dmar_domain *domain;
1013 i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1014 for (; i < cap_ndoms(iommu->cap); ) {
1015 domain = iommu->domains[i];
1016 clear_bit(i, iommu->domain_ids);
1017 domain_exit(domain);
1018 i = find_next_bit(iommu->domain_ids,
1019 cap_ndoms(iommu->cap), i+1);
1022 if (iommu->gcmd & DMA_GCMD_TE)
1023 iommu_disable_translation(iommu);
1026 set_irq_data(iommu->irq, NULL);
1027 /* This will mask the irq */
1028 free_irq(iommu->irq, iommu);
1029 destroy_irq(iommu->irq);
1032 kfree(iommu->domains);
1033 kfree(iommu->domain_ids);
1035 /* free context mapping */
1036 free_context_table(iommu);
1039 iounmap(iommu->reg);
1043 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1046 unsigned long ndomains;
1047 struct dmar_domain *domain;
1048 unsigned long flags;
1050 domain = alloc_domain_mem();
1054 ndomains = cap_ndoms(iommu->cap);
1056 spin_lock_irqsave(&iommu->lock, flags);
1057 num = find_first_zero_bit(iommu->domain_ids, ndomains);
1058 if (num >= ndomains) {
1059 spin_unlock_irqrestore(&iommu->lock, flags);
1060 free_domain_mem(domain);
1061 printk(KERN_ERR "IOMMU: no free domain ids\n");
1065 set_bit(num, iommu->domain_ids);
1067 domain->iommu = iommu;
1068 iommu->domains[num] = domain;
1069 spin_unlock_irqrestore(&iommu->lock, flags);
1074 static void iommu_free_domain(struct dmar_domain *domain)
1076 unsigned long flags;
1078 spin_lock_irqsave(&domain->iommu->lock, flags);
1079 clear_bit(domain->id, domain->iommu->domain_ids);
1080 spin_unlock_irqrestore(&domain->iommu->lock, flags);
1083 static struct iova_domain reserved_iova_list;
1085 static void dmar_init_reserved_ranges(void)
1087 struct pci_dev *pdev = NULL;
1092 init_iova_domain(&reserved_iova_list);
1094 /* IOAPIC ranges shouldn't be accessed by DMA */
1095 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1096 IOVA_PFN(IOAPIC_RANGE_END));
1098 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1100 /* Reserve all PCI MMIO to avoid peer-to-peer access */
1101 for_each_pci_dev(pdev) {
1104 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1105 r = &pdev->resource[i];
1106 if (!r->flags || !(r->flags & IORESOURCE_MEM))
1109 addr &= PAGE_MASK_4K;
1110 size = r->end - addr;
1111 size = PAGE_ALIGN_4K(size);
1112 iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1113 IOVA_PFN(size + addr) - 1);
1115 printk(KERN_ERR "Reserve iova failed\n");
1121 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1123 copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1126 static inline int guestwidth_to_adjustwidth(int gaw)
1129 int r = (gaw - 12) % 9;
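/*
 * Worked example (editor's note, assuming the elided remainder of
 * guestwidth_to_adjustwidth() rounds gaw up to the next 12 + 9*n so it
 * lands on a page-table-level boundary): gaw = 40 gives
 * r = (40 - 12) % 9 = 1 and is rounded up to 48, while gaw = 48 gives
 * r = 0 and is kept as is.
 */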
1140 static int domain_init(struct dmar_domain *domain, int guest_width)
1142 struct intel_iommu *iommu;
1143 int adjust_width, agaw;
1144 unsigned long sagaw;
1146 init_iova_domain(&domain->iovad);
1147 spin_lock_init(&domain->mapping_lock);
1149 domain_reserve_special_ranges(domain);
1151 /* calculate AGAW */
1152 iommu = domain->iommu;
1153 if (guest_width > cap_mgaw(iommu->cap))
1154 guest_width = cap_mgaw(iommu->cap);
1155 domain->gaw = guest_width;
1156 adjust_width = guestwidth_to_adjustwidth(guest_width);
1157 agaw = width_to_agaw(adjust_width);
1158 sagaw = cap_sagaw(iommu->cap);
1159 if (!test_bit(agaw, &sagaw)) {
1160 /* hardware doesn't support it, choose a bigger one */
1161 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1162 agaw = find_next_bit(&sagaw, 5, agaw);
1166 domain->agaw = agaw;
1167 INIT_LIST_HEAD(&domain->devices);
1169 /* always allocate the top pgd */
1170 domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1173 __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1177 static void domain_exit(struct dmar_domain *domain)
1181 /* Domain 0 is reserved, so don't process it */
1185 domain_remove_dev_info(domain);
1187 put_iova_domain(&domain->iovad);
1188 end = DOMAIN_MAX_ADDR(domain->gaw);
1189 end = end & (~PAGE_MASK_4K);
1192 dma_pte_clear_range(domain, 0, end);
1194 /* free page tables */
1195 dma_pte_free_pagetable(domain, 0, end);
1197 iommu_free_domain(domain);
1198 free_domain_mem(domain);
1201 static int domain_context_mapping_one(struct dmar_domain *domain,
1204 struct context_entry *context;
1205 struct intel_iommu *iommu = domain->iommu;
1206 unsigned long flags;
1208 pr_debug("Set context mapping for %02x:%02x.%d\n",
1209 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1210 BUG_ON(!domain->pgd);
1211 context = device_to_context_entry(iommu, bus, devfn);
1214 spin_lock_irqsave(&iommu->lock, flags);
1215 if (context_present(*context)) {
1216 spin_unlock_irqrestore(&iommu->lock, flags);
1220 context_set_domain_id(*context, domain->id);
1221 context_set_address_width(*context, domain->agaw);
1222 context_set_address_root(*context, virt_to_phys(domain->pgd));
1223 context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1224 context_set_fault_enable(*context);
1225 context_set_present(*context);
1226 __iommu_flush_cache(iommu, context, sizeof(*context));
1228 /* it's a non-present to present mapping */
1229 if (iommu_flush_context_device(iommu, domain->id,
1230 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1231 iommu_flush_write_buffer(iommu);
1233 iommu_flush_iotlb_dsi(iommu, 0, 0);
1234 spin_unlock_irqrestore(&iommu->lock, flags);
1239 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1242 struct pci_dev *tmp, *parent;
1244 ret = domain_context_mapping_one(domain, pdev->bus->number,
1249 /* dependent device mapping */
1250 tmp = pci_find_upstream_pcie_bridge(pdev);
1253 /* Secondary interface's bus number and devfn 0 */
1254 parent = pdev->bus->self;
1255 while (parent != tmp) {
1256 ret = domain_context_mapping_one(domain, parent->bus->number,
1260 parent = parent->bus->self;
1262 if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1263 return domain_context_mapping_one(domain,
1264 tmp->subordinate->number, 0);
1265 else /* this is a legacy PCI bridge */
1266 return domain_context_mapping_one(domain,
1267 tmp->bus->number, tmp->devfn);
1270 static int domain_context_mapped(struct dmar_domain *domain,
1271 struct pci_dev *pdev)
1274 struct pci_dev *tmp, *parent;
1276 ret = device_context_mapped(domain->iommu,
1277 pdev->bus->number, pdev->devfn);
1280 /* dependent device mapping */
1281 tmp = pci_find_upstream_pcie_bridge(pdev);
1284 /* Secondary interface's bus number and devfn 0 */
1285 parent = pdev->bus->self;
1286 while (parent != tmp) {
1287 ret = device_context_mapped(domain->iommu, parent->bus->number,
1291 parent = parent->bus->self;
1294 return device_context_mapped(domain->iommu,
1295 tmp->subordinate->number, 0);
1297 return device_context_mapped(domain->iommu,
1298 tmp->bus->number, tmp->devfn);
1302 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1303 u64 hpa, size_t size, int prot)
1305 u64 start_pfn, end_pfn;
1306 struct dma_pte *pte;
1309 if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1311 iova &= PAGE_MASK_4K;
1312 start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1313 end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1315 while (start_pfn < end_pfn) {
1316 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1319 /* We don't need lock here, nobody else
1320 * touches the iova range
1322 BUG_ON(dma_pte_addr(*pte));
1323 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1324 dma_set_pte_prot(*pte, prot);
1325 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1332 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1334 clear_context_table(domain->iommu, bus, devfn);
1335 iommu_flush_context_global(domain->iommu, 0);
1336 iommu_flush_iotlb_global(domain->iommu, 0);
1339 static void domain_remove_dev_info(struct dmar_domain *domain)
1341 struct device_domain_info *info;
1342 unsigned long flags;
1344 spin_lock_irqsave(&device_domain_lock, flags);
1345 while (!list_empty(&domain->devices)) {
1346 info = list_entry(domain->devices.next,
1347 struct device_domain_info, link);
1348 list_del(&info->link);
1349 list_del(&info->global);
1351 info->dev->dev.archdata.iommu = NULL;
1352 spin_unlock_irqrestore(&device_domain_lock, flags);
1354 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1355 free_devinfo_mem(info);
1357 spin_lock_irqsave(&device_domain_lock, flags);
1359 spin_unlock_irqrestore(&device_domain_lock, flags);
1364 * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1366 struct dmar_domain *
1367 find_domain(struct pci_dev *pdev)
1369 struct device_domain_info *info;
1371 /* No lock here; we assume no domain exits in the normal case */
1372 info = pdev->dev.archdata.iommu;
1374 return info->domain;
1378 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1379 struct pci_dev *dev)
1384 for (index = 0; index < cnt; index ++)
1385 if (dev == devices[index])
1388 /* Check our parent */
1389 dev = dev->bus->self;
1395 static struct dmar_drhd_unit *
1396 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1398 struct dmar_drhd_unit *drhd = NULL;
1400 list_for_each_entry(drhd, &dmar_drhd_units, list) {
1401 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1402 drhd->devices_cnt, dev))
1409 /* domain is initialized */
1410 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1412 struct dmar_domain *domain, *found = NULL;
1413 struct intel_iommu *iommu;
1414 struct dmar_drhd_unit *drhd;
1415 struct device_domain_info *info, *tmp;
1416 struct pci_dev *dev_tmp;
1417 unsigned long flags;
1418 int bus = 0, devfn = 0;
1420 domain = find_domain(pdev);
1424 dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1426 if (dev_tmp->is_pcie) {
1427 bus = dev_tmp->subordinate->number;
1430 bus = dev_tmp->bus->number;
1431 devfn = dev_tmp->devfn;
1433 spin_lock_irqsave(&device_domain_lock, flags);
1434 list_for_each_entry(info, &device_domain_list, global) {
1435 if (info->bus == bus && info->devfn == devfn) {
1436 found = info->domain;
1440 spin_unlock_irqrestore(&device_domain_lock, flags);
1441 /* the pcie-to-pci bridge already has a domain, use it */
1448 /* Allocate new domain for the device */
1449 drhd = dmar_find_matched_drhd_unit(pdev);
1451 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1455 iommu = drhd->iommu;
1457 domain = iommu_alloc_domain(iommu);
1461 if (domain_init(domain, gaw)) {
1462 domain_exit(domain);
1466 /* register pcie-to-pci device */
1468 info = alloc_devinfo_mem();
1470 domain_exit(domain);
1474 info->devfn = devfn;
1476 info->domain = domain;
1477 /* This domain is shared by devices under p2p bridge */
1478 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1480 /* the pcie-to-pci bridge already has a domain, use it */
1482 spin_lock_irqsave(&device_domain_lock, flags);
1483 list_for_each_entry(tmp, &device_domain_list, global) {
1484 if (tmp->bus == bus && tmp->devfn == devfn) {
1485 found = tmp->domain;
1490 free_devinfo_mem(info);
1491 domain_exit(domain);
1494 list_add(&info->link, &domain->devices);
1495 list_add(&info->global, &device_domain_list);
1497 spin_unlock_irqrestore(&device_domain_lock, flags);
1501 info = alloc_devinfo_mem();
1504 info->bus = pdev->bus->number;
1505 info->devfn = pdev->devfn;
1507 info->domain = domain;
1508 spin_lock_irqsave(&device_domain_lock, flags);
1509 /* somebody is fast */
1510 found = find_domain(pdev);
1511 if (found != NULL) {
1512 spin_unlock_irqrestore(&device_domain_lock, flags);
1513 if (found != domain) {
1514 domain_exit(domain);
1517 free_devinfo_mem(info);
1520 list_add(&info->link, &domain->devices);
1521 list_add(&info->global, &device_domain_list);
1522 pdev->dev.archdata.iommu = info;
1523 spin_unlock_irqrestore(&device_domain_lock, flags);
1526 /* recheck here; another thread may have set it meanwhile */
1527 return find_domain(pdev);
1530 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1532 struct dmar_domain *domain;
1538 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1539 pci_name(pdev), start, end);
1540 /* page table init */
1541 domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1545 /* The address might not be aligned */
1546 base = start & PAGE_MASK_4K;
1548 size = PAGE_ALIGN_4K(size);
1549 if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1550 IOVA_PFN(base + size) - 1)) {
1551 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1556 pr_debug("Mapping reserved region %lx@%llx for %s\n",
1557 size, base, pci_name(pdev));
1559 * The RMRR range might overlap a physical memory range; clear it first
1562 dma_pte_clear_range(domain, base, base + size);
1564 ret = domain_page_mapping(domain, base, base, size,
1565 DMA_PTE_READ|DMA_PTE_WRITE);
1569 /* context entry init */
1570 ret = domain_context_mapping(domain, pdev);
1574 domain_exit(domain);
1579 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1580 struct pci_dev *pdev)
1582 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1584 return iommu_prepare_identity_map(pdev, rmrr->base_address,
1585 rmrr->end_address + 1);
1588 #ifdef CONFIG_DMAR_GFX_WA
1589 extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);
1590 static void __init iommu_prepare_gfx_mapping(void)
1592 struct pci_dev *pdev = NULL;
1597 for_each_pci_dev(pdev) {
1598 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1599 !IS_GFX_DEVICE(pdev))
1601 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1603 slot = arch_get_ram_range(0, &base, &size);
1605 ret = iommu_prepare_identity_map(pdev,
1609 slot = arch_get_ram_range(slot, &base, &size);
1613 printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1618 #ifdef CONFIG_DMAR_FLOPPY_WA
1619 static inline void iommu_prepare_isa(void)
1621 struct pci_dev *pdev;
1624 pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1628 printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1629 ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1632 printk("IOMMU: Failed to create 0-64M identity map, "
1633 "floppy might not work\n");
1637 static inline void iommu_prepare_isa(void)
1641 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1643 int __init init_dmars(void)
1645 struct dmar_drhd_unit *drhd;
1646 struct dmar_rmrr_unit *rmrr;
1647 struct pci_dev *pdev;
1648 struct intel_iommu *iommu;
1654 * initialize and program root entry to not present
1657 for_each_drhd_unit(drhd) {
1660 iommu = alloc_iommu(drhd);
1668 * we could share the same root & context tables
1669 * among all IOMMUs. Need to split it later.
1671 ret = iommu_alloc_root_entry(iommu);
1673 printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1680 * for each dev attached to rmrr
1682 * locate drhd for dev, alloc domain for dev
1683 * allocate free domain
1684 * allocate page table entries for rmrr
1685 * if context not allocated for bus
1686 * allocate and init context
1687 * set present in root table for this bus
1688 * init context with domain, translation etc
1692 for_each_rmrr_units(rmrr) {
1694 for (i = 0; i < rmrr->devices_cnt; i++) {
1695 pdev = rmrr->devices[i];
1696 /* some BIOSes list non-existent devices in the DMAR table */
1699 ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1702 "IOMMU: mapping reserved region failed\n");
1706 iommu_prepare_gfx_mapping();
1708 iommu_prepare_isa();
1713 * global invalidate context cache
1714 * global invalidate iotlb
1715 * enable translation
1717 for_each_drhd_unit(drhd) {
1720 iommu = drhd->iommu;
1721 sprintf(iommu->name, "dmar%d", unit++);
1723 iommu_flush_write_buffer(iommu);
1725 ret = dmar_set_interrupt(iommu);
1729 iommu_set_root_entry(iommu);
1731 iommu_flush_context_global(iommu, 0);
1732 iommu_flush_iotlb_global(iommu, 0);
1734 ret = iommu_enable_translation(iommu);
1741 for_each_drhd_unit(drhd) {
1744 iommu = drhd->iommu;
1750 static inline u64 aligned_size(u64 host_addr, size_t size)
1753 addr = (host_addr & (~PAGE_MASK_4K)) + size;
1754 return PAGE_ALIGN_4K(addr);
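/*
 * Worked example (editor's note): aligned_size() rounds the span the
 * buffer actually touches up to whole 4K pages.  For host_addr = 0x1234
 * and size = 0x100 the intra-page offset is 0x234, 0x234 + 0x100 =
 * 0x334, and PAGE_ALIGN_4K(0x334) = 0x1000, i.e. one page.  A buffer of
 * 0x20 bytes starting at offset 0xff0 straddles a page boundary and
 * yields 0x2000, i.e. two pages.
 */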
1758 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1762 /* Make sure it's in range */
1763 end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1764 if (!size || (IOVA_START_ADDR + size > end))
1767 piova = alloc_iova(&domain->iovad,
1768 size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1772 static struct iova *
1773 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1776 struct pci_dev *pdev = to_pci_dev(dev);
1777 struct iova *iova = NULL;
1779 if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1780 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1783 * First try to allocate an I/O virtual address within
1784 * DMA_32BIT_MASK; if that fails, try allocating from the higher range
1787 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1789 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1793 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1800 static struct dmar_domain *
1801 get_valid_domain_for_dev(struct pci_dev *pdev)
1803 struct dmar_domain *domain;
1806 domain = get_domain_for_dev(pdev,
1807 DEFAULT_DOMAIN_ADDRESS_WIDTH);
1810 "Allocating domain for %s failed", pci_name(pdev));
1814 /* make sure context mapping is ok */
1815 if (unlikely(!domain_context_mapped(domain, pdev))) {
1816 ret = domain_context_mapping(domain, pdev);
1819 "Domain context map for %s failed",
1828 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1829 size_t size, int dir)
1831 struct pci_dev *pdev = to_pci_dev(hwdev);
1833 struct dmar_domain *domain;
1834 unsigned long start_addr;
1838 BUG_ON(dir == DMA_NONE);
1839 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1840 return virt_to_bus(addr);
1842 domain = get_valid_domain_for_dev(pdev);
1846 addr = (void *)virt_to_phys(addr);
1847 size = aligned_size((u64)addr, size);
1849 iova = __intel_alloc_iova(hwdev, domain, size);
1853 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1856 * Check if DMAR supports zero-length reads on write-only mappings
1859 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1860 !cap_zlr(domain->iommu->cap))
1861 prot |= DMA_PTE_READ;
1862 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1863 prot |= DMA_PTE_WRITE;
1865 * addr .. (addr + size) might cover partial pages, so we map the
1866 * whole page. Note: if two parts of one page are mapped separately,
1867 * we might have two guest addresses mapping to the same host addr,
1868 * but this is not a big problem
1870 ret = domain_page_mapping(domain, start_addr,
1871 ((u64)addr) & PAGE_MASK_4K, size, prot);
1875 pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1876 pci_name(pdev), size, (u64)addr,
1877 size, (u64)start_addr, dir);
1879 /* it's a non-present to present mapping */
1880 ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1881 start_addr, size >> PAGE_SHIFT_4K, 1);
1883 iommu_flush_write_buffer(domain->iommu);
1885 return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1889 __free_iova(&domain->iovad, iova);
1890 printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1891 pci_name(pdev), size, (u64)addr, dir);
1895 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1896 size_t size, int dir)
1898 struct pci_dev *pdev = to_pci_dev(dev);
1899 struct dmar_domain *domain;
1900 unsigned long start_addr;
1903 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1905 domain = find_domain(pdev);
1908 iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1912 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1913 size = aligned_size((u64)dev_addr, size);
1915 pr_debug("Device %s unmapping: %lx@%llx\n",
1916 pci_name(pdev), size, (u64)start_addr);
1918 /* clear the whole page */
1919 dma_pte_clear_range(domain, start_addr, start_addr + size);
1920 /* free page tables */
1921 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1923 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1924 size >> PAGE_SHIFT_4K, 0))
1925 iommu_flush_write_buffer(domain->iommu);
1928 __free_iova(&domain->iovad, iova);
1931 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1932 dma_addr_t *dma_handle, gfp_t flags)
1937 size = PAGE_ALIGN_4K(size);
1938 order = get_order(size);
1939 flags &= ~(GFP_DMA | GFP_DMA32);
1941 vaddr = (void *)__get_free_pages(flags, order);
1944 memset(vaddr, 0, size);
1946 *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1949 free_pages((unsigned long)vaddr, order);
1953 static void intel_free_coherent(struct device *hwdev, size_t size,
1954 void *vaddr, dma_addr_t dma_handle)
1958 size = PAGE_ALIGN_4K(size);
1959 order = get_order(size);
1961 intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1962 free_pages((unsigned long)vaddr, order);
1965 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
1966 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1967 int nelems, int dir)
1970 struct pci_dev *pdev = to_pci_dev(hwdev);
1971 struct dmar_domain *domain;
1972 unsigned long start_addr;
1976 struct scatterlist *sg;
1978 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1981 domain = find_domain(pdev);
1983 iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
1986 for_each_sg(sglist, sg, nelems, i) {
1987 addr = SG_ENT_VIRT_ADDRESS(sg);
1988 size += aligned_size((u64)addr, sg->length);
1991 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1993 /* clear the whole page */
1994 dma_pte_clear_range(domain, start_addr, start_addr + size);
1995 /* free page tables */
1996 dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1998 if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1999 size >> PAGE_SHIFT_4K, 0))
2000 iommu_flush_write_buffer(domain->iommu);
2003 __free_iova(&domain->iovad, iova);
2006 static int intel_nontranslate_map_sg(struct device *hddev,
2007 struct scatterlist *sglist, int nelems, int dir)
2010 struct scatterlist *sg;
2012 for_each_sg(sglist, sg, nelems, i) {
2013 BUG_ON(!sg_page(sg));
2014 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2015 sg->dma_length = sg->length;
2020 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2021 int nelems, int dir)
2025 struct pci_dev *pdev = to_pci_dev(hwdev);
2026 struct dmar_domain *domain;
2030 struct iova *iova = NULL;
2032 struct scatterlist *sg;
2033 unsigned long start_addr;
2035 BUG_ON(dir == DMA_NONE);
2036 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2037 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2039 domain = get_valid_domain_for_dev(pdev);
2043 for_each_sg(sglist, sg, nelems, i) {
2044 addr = SG_ENT_VIRT_ADDRESS(sg);
2045 addr = (void *)virt_to_phys(addr);
2046 size += aligned_size((u64)addr, sg->length);
2049 iova = __intel_alloc_iova(hwdev, domain, size);
2051 sglist->dma_length = 0;
2056 * Check if DMAR supports zero-length reads on write-only mappings
2059 if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2060 !cap_zlr(domain->iommu->cap))
2061 prot |= DMA_PTE_READ;
2062 if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2063 prot |= DMA_PTE_WRITE;
2065 start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2067 for_each_sg(sglist, sg, nelems, i) {
2068 addr = SG_ENT_VIRT_ADDRESS(sg);
2069 addr = (void *)virt_to_phys(addr);
2070 size = aligned_size((u64)addr, sg->length);
2071 ret = domain_page_mapping(domain, start_addr + offset,
2072 ((u64)addr) & PAGE_MASK_4K,
2075 /* clear the page */
2076 dma_pte_clear_range(domain, start_addr,
2077 start_addr + offset);
2078 /* free page tables */
2079 dma_pte_free_pagetable(domain, start_addr,
2080 start_addr + offset);
2082 __free_iova(&domain->iovad, iova);
2085 sg->dma_address = start_addr + offset +
2086 ((u64)addr & (~PAGE_MASK_4K));
2087 sg->dma_length = sg->length;
2091 /* it's a non-present to present mapping */
2092 if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2093 start_addr, offset >> PAGE_SHIFT_4K, 1))
2094 iommu_flush_write_buffer(domain->iommu);
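/*
 * Editor's note: the whole scatterlist is mapped into one contiguous
 * IOVA range starting at start_addr; each sg->dma_address is start_addr
 * plus the running offset (accumulated in the elided part of the loop)
 * plus the element's offset within its first page.
 */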
2098 static struct dma_mapping_ops intel_dma_ops = {
2099 .alloc_coherent = intel_alloc_coherent,
2100 .free_coherent = intel_free_coherent,
2101 .map_single = intel_map_single,
2102 .unmap_single = intel_unmap_single,
2103 .map_sg = intel_map_sg,
2104 .unmap_sg = intel_unmap_sg,
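/*
 * Usage sketch (editor's example, not part of the driver): once
 * intel_iommu_init() below points dma_ops at intel_dma_ops, ordinary
 * DMA API calls from drivers are routed through the hooks above, e.g.:
 *
 *	dma_addr_t dma = dma_map_single(&pdev->dev, buf, len,
 *					DMA_TO_DEVICE);
 *	...
 *	dma_unmap_single(&pdev->dev, dma, len, DMA_TO_DEVICE);
 */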
2107 static inline int iommu_domain_cache_init(void)
2111 iommu_domain_cache = kmem_cache_create("iommu_domain",
2112 sizeof(struct dmar_domain),
2117 if (!iommu_domain_cache) {
2118 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2125 static inline int iommu_devinfo_cache_init(void)
2129 iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2130 sizeof(struct device_domain_info),
2135 if (!iommu_devinfo_cache) {
2136 printk(KERN_ERR "Couldn't create devinfo cache\n");
2143 static inline int iommu_iova_cache_init(void)
2147 iommu_iova_cache = kmem_cache_create("iommu_iova",
2148 sizeof(struct iova),
2153 if (!iommu_iova_cache) {
2154 printk(KERN_ERR "Couldn't create iova cache\n");
2161 static int __init iommu_init_mempool(void)
2164 ret = iommu_iova_cache_init();
2168 ret = iommu_domain_cache_init();
2172 ret = iommu_devinfo_cache_init();
2176 kmem_cache_destroy(iommu_domain_cache);
2178 kmem_cache_destroy(iommu_iova_cache);
2183 static void __init iommu_exit_mempool(void)
2185 kmem_cache_destroy(iommu_devinfo_cache);
2186 kmem_cache_destroy(iommu_domain_cache);
2187 kmem_cache_destroy(iommu_iova_cache);
2191 void __init detect_intel_iommu(void)
2193 if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2195 if (early_dmar_detect()) {
2200 static void __init init_no_remapping_devices(void)
2202 struct dmar_drhd_unit *drhd;
2204 for_each_drhd_unit(drhd) {
2205 if (!drhd->include_all) {
2207 for (i = 0; i < drhd->devices_cnt; i++)
2208 if (drhd->devices[i] != NULL)
2210 /* ignore DMAR unit if no pci devices exist */
2211 if (i == drhd->devices_cnt)
2219 for_each_drhd_unit(drhd) {
2221 if (drhd->ignored || drhd->include_all)
2224 for (i = 0; i < drhd->devices_cnt; i++)
2225 if (drhd->devices[i] &&
2226 !IS_GFX_DEVICE(drhd->devices[i]))
2229 if (i < drhd->devices_cnt)
2232 /* bypass IOMMU if it is just for gfx devices */
2234 for (i = 0; i < drhd->devices_cnt; i++) {
2235 if (!drhd->devices[i])
2237 drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2242 int __init intel_iommu_init(void)
2246 if (no_iommu || swiotlb || dmar_disabled)
2249 if (dmar_table_init())
2252 iommu_init_mempool();
2253 dmar_init_reserved_ranges();
2255 init_no_remapping_devices();
2259 printk(KERN_ERR "IOMMU: dmar init failed\n");
2260 put_iova_domain(&reserved_iova_list);
2261 iommu_exit_mempool();
2265 "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2268 dma_ops = &intel_dma_ops;