[linux-2.6] drivers/pci/intel-iommu.c (merge commit 'v2.6.28-rc6' into x86/debug)
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/sysdev.h>
31 #include <linux/spinlock.h>
32 #include <linux/pci.h>
33 #include <linux/dmar.h>
34 #include <linux/dma-mapping.h>
35 #include <linux/mempool.h>
36 #include <linux/timer.h>
37 #include <linux/iova.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
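/* pdev->class holds (base class << 16) | (sub class << 8) | prog-if */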
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57
58 static void flush_unmaps_timeout(unsigned long data);
59
60 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
61
62 #define HIGH_WATER_MARK 250
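/*
 * Per-IOMMU table of IOVAs whose mappings have been torn down but whose
 * IOTLB entries have not yet been invalidated.  Unmaps are batched here
 * and flushed together, either when a table reaches HIGH_WATER_MARK
 * entries or when unmap_timer fires; intel_iommu=strict disables the
 * batching.
 */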
63 struct deferred_flush_tables {
64         int next;
65         struct iova *iova[HIGH_WATER_MARK];
66         struct dmar_domain *domain[HIGH_WATER_MARK];
67 };
68
69 static struct deferred_flush_tables *deferred_flush;
70
71 /* number of IOMMUs in the system; used to size per-IOMMU tables */
72 static int g_num_of_iommus;
73
74 static DEFINE_SPINLOCK(async_umap_flush_lock);
75 static LIST_HEAD(unmaps_to_do);
76
77 static int timer_on;
78 static long list_size;
79
80 static void domain_remove_dev_info(struct dmar_domain *domain);
81
82 int dmar_disabled;
83 static int __initdata dmar_map_gfx = 1;
84 static int dmar_forcedac;
85 static int intel_iommu_strict;
86
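/*
 * Sentinel stored in pdev->dev.archdata.iommu to mark devices the driver
 * ignores; such devices are skipped when RMRR and graphics identity maps
 * are prepared.
 */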
87 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
88 static DEFINE_SPINLOCK(device_domain_lock);
89 static LIST_HEAD(device_domain_list);
90
91 static int __init intel_iommu_setup(char *str)
92 {
93         if (!str)
94                 return -EINVAL;
95         while (*str) {
96                 if (!strncmp(str, "off", 3)) {
97                         dmar_disabled = 1;
98                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
99                 } else if (!strncmp(str, "igfx_off", 8)) {
100                         dmar_map_gfx = 0;
101                         printk(KERN_INFO
102                                 "Intel-IOMMU: disable GFX device mapping\n");
103                 } else if (!strncmp(str, "forcedac", 8)) {
104                         printk(KERN_INFO
105                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
106                         dmar_forcedac = 1;
107                 } else if (!strncmp(str, "strict", 6)) {
108                         printk(KERN_INFO
109                                 "Intel-IOMMU: disable batched IOTLB flush\n");
110                         intel_iommu_strict = 1;
111                 }
112
113                 str += strcspn(str, ",");
114                 while (*str == ',')
115                         str++;
116         }
117         return 0;
118 }
119 __setup("intel_iommu=", intel_iommu_setup);
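/*
 * Example command line usage, with options combined by commas:
 *     intel_iommu=off              disable DMA remapping completely
 *     intel_iommu=igfx_off         leave the graphics device unmapped
 *     intel_iommu=forcedac         force 64-bit (DAC) DMA addressing
 *     intel_iommu=igfx_off,strict  also flush the IOTLB on every unmap
 */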
120
121 static struct kmem_cache *iommu_domain_cache;
122 static struct kmem_cache *iommu_devinfo_cache;
123 static struct kmem_cache *iommu_iova_cache;
124
125 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
126 {
127         unsigned int flags;
128         void *vaddr;
129
130         /* set PF_MEMALLOC so the allocation may dip into emergency reserves */
131         flags = current->flags & PF_MEMALLOC;
132         current->flags |= PF_MEMALLOC;
133         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
134         current->flags &= (~PF_MEMALLOC | flags);
135         return vaddr;
136 }
137
138
139 static inline void *alloc_pgtable_page(void)
140 {
141         unsigned int flags;
142         void *vaddr;
143
144         /* set PF_MEMALLOC so the allocation may dip into emergency reserves */
145         flags = current->flags & PF_MEMALLOC;
146         current->flags |= PF_MEMALLOC;
147         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
148         current->flags &= (~PF_MEMALLOC | flags);
149         return vaddr;
150 }
151
152 static inline void free_pgtable_page(void *vaddr)
153 {
154         free_page((unsigned long)vaddr);
155 }
156
157 static inline void *alloc_domain_mem(void)
158 {
159         return iommu_kmem_cache_alloc(iommu_domain_cache);
160 }
161
162 static void free_domain_mem(void *vaddr)
163 {
164         kmem_cache_free(iommu_domain_cache, vaddr);
165 }
166
167 static inline void * alloc_devinfo_mem(void)
168 {
169         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
170 }
171
172 static inline void free_devinfo_mem(void *vaddr)
173 {
174         kmem_cache_free(iommu_devinfo_cache, vaddr);
175 }
176
177 struct iova *alloc_iova_mem(void)
178 {
179         return iommu_kmem_cache_alloc(iommu_iova_cache);
180 }
181
182 void free_iova_mem(struct iova *iova)
183 {
184         kmem_cache_free(iommu_iova_cache, iova);
185 }
186
187 /* Gets context entry for a given bus and devfn */
188 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
189                 u8 bus, u8 devfn)
190 {
191         struct root_entry *root;
192         struct context_entry *context;
193         unsigned long phy_addr;
194         unsigned long flags;
195
196         spin_lock_irqsave(&iommu->lock, flags);
197         root = &iommu->root_entry[bus];
198         context = get_context_addr_from_root(root);
199         if (!context) {
200                 context = (struct context_entry *)alloc_pgtable_page();
201                 if (!context) {
202                         spin_unlock_irqrestore(&iommu->lock, flags);
203                         return NULL;
204                 }
205                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
206                 phy_addr = virt_to_phys((void *)context);
207                 set_root_value(root, phy_addr);
208                 set_root_present(root);
209                 __iommu_flush_cache(iommu, root, sizeof(*root));
210         }
211         spin_unlock_irqrestore(&iommu->lock, flags);
212         return &context[devfn];
213 }
214
215 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
216 {
217         struct root_entry *root;
218         struct context_entry *context;
219         int ret;
220         unsigned long flags;
221
222         spin_lock_irqsave(&iommu->lock, flags);
223         root = &iommu->root_entry[bus];
224         context = get_context_addr_from_root(root);
225         if (!context) {
226                 ret = 0;
227                 goto out;
228         }
229         ret = context_present(context[devfn]);
230 out:
231         spin_unlock_irqrestore(&iommu->lock, flags);
232         return ret;
233 }
234
235 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
236 {
237         struct root_entry *root;
238         struct context_entry *context;
239         unsigned long flags;
240
241         spin_lock_irqsave(&iommu->lock, flags);
242         root = &iommu->root_entry[bus];
243         context = get_context_addr_from_root(root);
244         if (context) {
245                 context_clear_entry(context[devfn]);
246                 __iommu_flush_cache(iommu, &context[devfn],
247                         sizeof(*context));
248         }
249         spin_unlock_irqrestore(&iommu->lock, flags);
250 }
251
252 static void free_context_table(struct intel_iommu *iommu)
253 {
254         struct root_entry *root;
255         int i;
256         unsigned long flags;
257         struct context_entry *context;
258
259         spin_lock_irqsave(&iommu->lock, flags);
260         if (!iommu->root_entry)
261                 goto out;
263         for (i = 0; i < ROOT_ENTRY_NR; i++) {
264                 root = &iommu->root_entry[i];
265                 context = get_context_addr_from_root(root);
266                 if (context)
267                         free_pgtable_page(context);
268         }
269         free_pgtable_page(iommu->root_entry);
270         iommu->root_entry = NULL;
271 out:
272         spin_unlock_irqrestore(&iommu->lock, flags);
273 }
274
275 /* page table handling */
276 #define LEVEL_STRIDE            (9)
277 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
278
279 static inline int agaw_to_level(int agaw)
280 {
281         return agaw + 2;
282 }
283
284 static inline int agaw_to_width(int agaw)
285 {
286         return 30 + agaw * LEVEL_STRIDE;
288 }
289
290 static inline int width_to_agaw(int width)
291 {
292         return (width - 30) / LEVEL_STRIDE;
293 }
294
295 static inline unsigned int level_to_offset_bits(int level)
296 {
297         return (12 + (level - 1) * LEVEL_STRIDE);
298 }
299
300 static inline int address_level_offset(u64 addr, int level)
301 {
302         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
303 }
304
305 static inline u64 level_mask(int level)
306 {
307         return ((u64)-1 << level_to_offset_bits(level));
308 }
309
310 static inline u64 level_size(int level)
311 {
312         return ((u64)1 << level_to_offset_bits(level));
313 }
314
315 static inline u64 align_to_level(u64 addr, int level)
316 {
317         return ((addr + level_size(level) - 1) & level_mask(level));
318 }
319
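/*
 * Walk (building missing levels as needed) down to the leaf PTE that maps
 * @addr.  Worked example for the default 48-bit address width:
 *     agaw  = width_to_agaw(48) = (48 - 30) / 9 = 2
 *     level = agaw_to_level(2)  = 4                (4-level table)
 * so level 4 indexes addr[47:39], level 3 addr[38:30], level 2 addr[29:21]
 * and level 1 addr[20:12]; the low 12 bits are the offset into the 4KiB
 * VT-d page.
 */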
320 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
321 {
322         int addr_width = agaw_to_width(domain->agaw);
323         struct dma_pte *parent, *pte = NULL;
324         int level = agaw_to_level(domain->agaw);
325         int offset;
326         unsigned long flags;
327
328         BUG_ON(!domain->pgd);
329
330         addr &= (((u64)1) << addr_width) - 1;
331         parent = domain->pgd;
332
333         spin_lock_irqsave(&domain->mapping_lock, flags);
334         while (level > 0) {
335                 void *tmp_page;
336
337                 offset = address_level_offset(addr, level);
338                 pte = &parent[offset];
339                 if (level == 1)
340                         break;
341
342                 if (!dma_pte_present(*pte)) {
343                         tmp_page = alloc_pgtable_page();
344
345                         if (!tmp_page) {
346                                 spin_unlock_irqrestore(&domain->mapping_lock,
347                                         flags);
348                                 return NULL;
349                         }
350                         __iommu_flush_cache(domain->iommu, tmp_page,
351                                         PAGE_SIZE);
352                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
353                         /*
354                          * higher-level tables always set r/w; the last
355                          * level page table controls read/write access
356                          */
357                         dma_set_pte_readable(*pte);
358                         dma_set_pte_writable(*pte);
359                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
360                 }
361                 parent = phys_to_virt(dma_pte_addr(*pte));
362                 level--;
363         }
364
365         spin_unlock_irqrestore(&domain->mapping_lock, flags);
366         return pte;
367 }
368
369 /* return the pte for an address at a specific level */
370 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
371                 int level)
372 {
373         struct dma_pte *parent, *pte = NULL;
374         int total = agaw_to_level(domain->agaw);
375         int offset;
376
377         parent = domain->pgd;
378         while (level <= total) {
379                 offset = address_level_offset(addr, total);
380                 pte = &parent[offset];
381                 if (level == total)
382                         return pte;
383
384                 if (!dma_pte_present(*pte))
385                         break;
386                 parent = phys_to_virt(dma_pte_addr(*pte));
387                 total--;
388         }
389         return NULL;
390 }
391
392 /* clear the last-level pte for one page */
393 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
394 {
395         struct dma_pte *pte = NULL;
396
397         /* get last level pte */
398         pte = dma_addr_level_pte(domain, addr, 1);
399
400         if (pte) {
401                 dma_clear_pte(*pte);
402                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
403         }
404 }
405
406 /* clear last-level ptes; an IOTLB flush must follow */
407 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
408 {
409         int addr_width = agaw_to_width(domain->agaw);
410
411         start &= (((u64)1) << addr_width) - 1;
412         end &= (((u64)1) << addr_width) - 1;
413         /* in case of a partial page */
414         start = PAGE_ALIGN(start);
415         end &= PAGE_MASK;
416
417         /* we don't need a lock here; nobody else touches this iova range */
418         while (start < end) {
419                 dma_pte_clear_one(domain, start);
420                 start += VTD_PAGE_SIZE;
421         }
422 }
423
424 /* free page-table pages; the last-level ptes should already be cleared */
425 static void dma_pte_free_pagetable(struct dmar_domain *domain,
426         u64 start, u64 end)
427 {
428         int addr_width = agaw_to_width(domain->agaw);
429         struct dma_pte *pte;
430         int total = agaw_to_level(domain->agaw);
431         int level;
432         u64 tmp;
433
434         start &= (((u64)1) << addr_width) - 1;
435         end &= (((u64)1) << addr_width) - 1;
436
437         /* we don't need a lock here; nobody else touches this iova range */
438         level = 2;
439         while (level <= total) {
440                 tmp = align_to_level(start, level);
441                 if (tmp >= end || (tmp + level_size(level) > end))
442                         return;
443
444                 while (tmp < end) {
445                         pte = dma_addr_level_pte(domain, tmp, level);
446                         if (pte) {
447                                 free_pgtable_page(
448                                         phys_to_virt(dma_pte_addr(*pte)));
449                                 dma_clear_pte(*pte);
450                                 __iommu_flush_cache(domain->iommu,
451                                                 pte, sizeof(*pte));
452                         }
453                         tmp += level_size(level);
454                 }
455                 level++;
456         }
457         /* free pgd */
458         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
459                 free_pgtable_page(domain->pgd);
460                 domain->pgd = NULL;
461         }
462 }
463
464 /* iommu handling */
465 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
466 {
467         struct root_entry *root;
468         unsigned long flags;
469
470         root = (struct root_entry *)alloc_pgtable_page();
471         if (!root)
472                 return -ENOMEM;
473
474         __iommu_flush_cache(iommu, root, ROOT_SIZE);
475
476         spin_lock_irqsave(&iommu->lock, flags);
477         iommu->root_entry = root;
478         spin_unlock_irqrestore(&iommu->lock, flags);
479
480         return 0;
481 }
482
483 static void iommu_set_root_entry(struct intel_iommu *iommu)
484 {
485         void *addr;
486         u32 cmd, sts;
487         unsigned long flag;
488
489         addr = iommu->root_entry;
490
491         spin_lock_irqsave(&iommu->register_lock, flag);
492         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
493
494         cmd = iommu->gcmd | DMA_GCMD_SRTP;
495         writel(cmd, iommu->reg + DMAR_GCMD_REG);
496
497         /* Make sure the hardware completes it */
498         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
499                 readl, (sts & DMA_GSTS_RTPS), sts);
500
501         spin_unlock_irqrestore(&iommu->register_lock, flag);
502 }
503
504 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
505 {
506         u32 val;
507         unsigned long flag;
508
509         if (!cap_rwbf(iommu->cap))
510                 return;
511         val = iommu->gcmd | DMA_GCMD_WBF;
512
513         spin_lock_irqsave(&iommu->register_lock, flag);
514         writel(val, iommu->reg + DMAR_GCMD_REG);
515
516         /* Make sure the hardware completes it */
517         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
518                         readl, (!(val & DMA_GSTS_WBFS)), val);
519
520         spin_unlock_irqrestore(&iommu->register_lock, flag);
521 }
522
523 /* the return value determines whether we need a write buffer flush */
524 static int __iommu_flush_context(struct intel_iommu *iommu,
525         u16 did, u16 source_id, u8 function_mask, u64 type,
526         int non_present_entry_flush)
527 {
528         u64 val = 0;
529         unsigned long flag;
530
531         /*
532          * In the non-present entry flush case: if the hardware doesn't
533          * cache non-present entries we do nothing; if it does, we flush
534          * the entries of domain 0 (the domain id used to cache any
535          * non-present entries)
536          */
537         if (non_present_entry_flush) {
538                 if (!cap_caching_mode(iommu->cap))
539                         return 1;
540                 else
541                         did = 0;
542         }
543
544         switch (type) {
545         case DMA_CCMD_GLOBAL_INVL:
546                 val = DMA_CCMD_GLOBAL_INVL;
547                 break;
548         case DMA_CCMD_DOMAIN_INVL:
549                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
550                 break;
551         case DMA_CCMD_DEVICE_INVL:
552                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
553                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
554                 break;
555         default:
556                 BUG();
557         }
558         val |= DMA_CCMD_ICC;
559
560         spin_lock_irqsave(&iommu->register_lock, flag);
561         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
562
563         /* Make sure the hardware completes it */
564         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
565                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
566
567         spin_unlock_irqrestore(&iommu->register_lock, flag);
568
569         /* flush context entry will implicitly flush write buffer */
570         return 0;
571 }
572
573 /* the return value determines whether we need a write buffer flush */
574 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
575         u64 addr, unsigned int size_order, u64 type,
576         int non_present_entry_flush)
577 {
578         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
579         u64 val = 0, val_iva = 0;
580         unsigned long flag;
581
582         /*
583          * In the non-present entry flush case: if the hardware doesn't
584          * cache non-present entries we do nothing; if it does, we flush
585          * the entries of domain 0 (the domain id used to cache any
586          * non-present entries)
587          */
588         if (non_present_entry_flush) {
589                 if (!cap_caching_mode(iommu->cap))
590                         return 1;
591                 else
592                         did = 0;
593         }
594
595         switch (type) {
596         case DMA_TLB_GLOBAL_FLUSH:
597                 /* a global flush doesn't need to set IVA_REG */
598                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
599                 break;
600         case DMA_TLB_DSI_FLUSH:
601                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
602                 break;
603         case DMA_TLB_PSI_FLUSH:
604                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
605                 /* Note: always flush non-leaf currently */
606                 val_iva = size_order | addr;
607                 break;
608         default:
609                 BUG();
610         }
611         /* Note: set drain read/write */
612 #if 0
613         /*
614          * This is probably only here to be extra safe; it looks like we
615          * can ignore it without any impact.
616          */
617         if (cap_read_drain(iommu->cap))
618                 val |= DMA_TLB_READ_DRAIN;
619 #endif
620         if (cap_write_drain(iommu->cap))
621                 val |= DMA_TLB_WRITE_DRAIN;
622
623         spin_lock_irqsave(&iommu->register_lock, flag);
624         /* Note: Only uses first TLB reg currently */
625         if (val_iva)
626                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
627         dmar_writeq(iommu->reg + tlb_offset + 8, val);
628
629         /* Make sure the hardware completes it */
630         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
631                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
632
633         spin_unlock_irqrestore(&iommu->register_lock, flag);
634
635         /* check IOTLB invalidation granularity */
636         if (DMA_TLB_IAIG(val) == 0)
637                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
638         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
639                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
640                         (unsigned long long)DMA_TLB_IIRG(type),
641                         (unsigned long long)DMA_TLB_IAIG(val));
642         /* flush iotlb entry will implicitly flush write buffer */
643         return 0;
644 }
645
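/*
 * Page-selective invalidation takes the region size as an address mask:
 * 2^mask naturally aligned pages.  For example, flushing 5 pages gives
 * mask = ilog2(__roundup_pow_of_two(5)) = 3, so the aligned 8-page
 * (32KiB) region containing @addr is invalidated.
 */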
646 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
647         u64 addr, unsigned int pages, int non_present_entry_flush)
648 {
649         unsigned int mask;
650
651         BUG_ON(addr & (~VTD_PAGE_MASK));
652         BUG_ON(pages == 0);
653
654         /* Fallback to domain selective flush if no PSI support */
655         if (!cap_pgsel_inv(iommu->cap))
656                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
657                                                 DMA_TLB_DSI_FLUSH,
658                                                 non_present_entry_flush);
659
660         /*
661          * PSI requires the number of pages to be a power of two, with the
662          * base address naturally aligned to that size
663          */
664         mask = ilog2(__roundup_pow_of_two(pages));
665         /* Fallback to domain selective flush if size is too big */
666         if (mask > cap_max_amask_val(iommu->cap))
667                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
668                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
669
670         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
671                                         DMA_TLB_PSI_FLUSH,
672                                         non_present_entry_flush);
673 }
674
675 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
676 {
677         u32 pmen;
678         unsigned long flags;
679
680         spin_lock_irqsave(&iommu->register_lock, flags);
681         pmen = readl(iommu->reg + DMAR_PMEN_REG);
682         pmen &= ~DMA_PMEN_EPM;
683         writel(pmen, iommu->reg + DMAR_PMEN_REG);
684
685         /* wait for the protected region status bit to clear */
686         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
687                 readl, !(pmen & DMA_PMEN_PRS), pmen);
688
689         spin_unlock_irqrestore(&iommu->register_lock, flags);
690 }
691
692 static int iommu_enable_translation(struct intel_iommu *iommu)
693 {
694         u32 sts;
695         unsigned long flags;
696
697         spin_lock_irqsave(&iommu->register_lock, flags);
698         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
699
700         /* Make sure the hardware completes it */
701         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
702                 readl, (sts & DMA_GSTS_TES), sts);
703
704         iommu->gcmd |= DMA_GCMD_TE;
705         spin_unlock_irqrestore(&iommu->register_lock, flags);
706         return 0;
707 }
708
709 static int iommu_disable_translation(struct intel_iommu *iommu)
710 {
711         u32 sts;
712         unsigned long flag;
713
714         spin_lock_irqsave(&iommu->register_lock, flag);
715         iommu->gcmd &= ~DMA_GCMD_TE;
716         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
717
718         /* Make sure the hardware completes it */
719         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
720                 readl, (!(sts & DMA_GSTS_TES)), sts);
721
722         spin_unlock_irqrestore(&iommu->register_lock, flag);
723         return 0;
724 }
725
726 /* iommu interrupt handling. Most of it is MSI-like. */
727
728 static const char *fault_reason_strings[] =
729 {
730         "Software",
731         "Present bit in root entry is clear",
732         "Present bit in context entry is clear",
733         "Invalid context entry",
734         "Access beyond MGAW",
735         "PTE Write access is not set",
736         "PTE Read access is not set",
737         "Next page table ptr is invalid",
738         "Root table address invalid",
739         "Context table ptr is invalid",
740         "non-zero reserved fields in RTP",
741         "non-zero reserved fields in CTP",
742         "non-zero reserved fields in PTE",
743 };
744 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
745
746 const char *dmar_get_fault_reason(u8 fault_reason)
747 {
748         if (fault_reason > MAX_FAULT_REASON_IDX)
749                 return "Unknown";
750         else
751                 return fault_reason_strings[fault_reason];
752 }
753
754 void dmar_msi_unmask(unsigned int irq)
755 {
756         struct intel_iommu *iommu = get_irq_data(irq);
757         unsigned long flag;
758
759         /* unmask it */
760         spin_lock_irqsave(&iommu->register_lock, flag);
761         writel(0, iommu->reg + DMAR_FECTL_REG);
762         /* Read back a reg to force-flush the posted write */
763         readl(iommu->reg + DMAR_FECTL_REG);
764         spin_unlock_irqrestore(&iommu->register_lock, flag);
765 }
766
767 void dmar_msi_mask(unsigned int irq)
768 {
769         unsigned long flag;
770         struct intel_iommu *iommu = get_irq_data(irq);
771
772         /* mask it */
773         spin_lock_irqsave(&iommu->register_lock, flag);
774         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
775         /* Read back a reg to force-flush the posted write */
776         readl(iommu->reg + DMAR_FECTL_REG);
777         spin_unlock_irqrestore(&iommu->register_lock, flag);
778 }
779
780 void dmar_msi_write(int irq, struct msi_msg *msg)
781 {
782         struct intel_iommu *iommu = get_irq_data(irq);
783         unsigned long flag;
784
785         spin_lock_irqsave(&iommu->register_lock, flag);
786         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
787         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
788         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
789         spin_unlock_irqrestore(&iommu->register_lock, flag);
790 }
791
792 void dmar_msi_read(int irq, struct msi_msg *msg)
793 {
794         struct intel_iommu *iommu = get_irq_data(irq);
795         unsigned long flag;
796
797         spin_lock_irqsave(&iommu->register_lock, flag);
798         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
799         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
800         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
801         spin_unlock_irqrestore(&iommu->register_lock, flag);
802 }
803
804 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
805                 u8 fault_reason, u16 source_id, unsigned long long addr)
806 {
807         const char *reason;
808
809         reason = dmar_get_fault_reason(fault_reason);
810
811         printk(KERN_ERR
812                 "DMAR:[%s] Request device [%02x:%02x.%d] "
813                 "fault addr %llx\n"
814                 "DMAR:[fault reason %02d] %s\n",
815                 (type ? "DMA Read" : "DMA Write"),
816                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
817                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
818         return 0;
819 }
820
821 #define PRIMARY_FAULT_REG_LEN (16)
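/*
 * Each primary fault recording register is 16 bytes: the low quadword
 * holds the faulting page address, the dword at offset 8 the requester's
 * source-id, and the dword at offset 12 the Fault bit, fault reason and
 * request type.
 */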
822 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
823 {
824         struct intel_iommu *iommu = dev_id;
825         int reg, fault_index;
826         u32 fault_status;
827         unsigned long flag;
828
829         spin_lock_irqsave(&iommu->register_lock, flag);
830         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
831
832         /* TBD: ignore advanced fault log currently */
833         if (!(fault_status & DMA_FSTS_PPF))
834                 goto clear_overflow;
835
836         fault_index = dma_fsts_fault_record_index(fault_status);
837         reg = cap_fault_reg_offset(iommu->cap);
838         while (1) {
839                 u8 fault_reason;
840                 u16 source_id;
841                 u64 guest_addr;
842                 int type;
843                 u32 data;
844
845                 /* highest 32 bits */
846                 data = readl(iommu->reg + reg +
847                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
848                 if (!(data & DMA_FRCD_F))
849                         break;
850
851                 fault_reason = dma_frcd_fault_reason(data);
852                 type = dma_frcd_type(data);
853
854                 data = readl(iommu->reg + reg +
855                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
856                 source_id = dma_frcd_source_id(data);
857
858                 guest_addr = dmar_readq(iommu->reg + reg +
859                                 fault_index * PRIMARY_FAULT_REG_LEN);
860                 guest_addr = dma_frcd_page_addr(guest_addr);
861                 /* clear the fault */
862                 writel(DMA_FRCD_F, iommu->reg + reg +
863                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
864
865                 spin_unlock_irqrestore(&iommu->register_lock, flag);
866
867                 iommu_page_fault_do_one(iommu, type, fault_reason,
868                                 source_id, guest_addr);
869
870                 fault_index++;
871                 if (fault_index >= cap_num_fault_regs(iommu->cap))
872                         fault_index = 0;
873                 spin_lock_irqsave(&iommu->register_lock, flag);
874         }
875 clear_overflow:
876         /* clear primary fault overflow */
877         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
878         if (fault_status & DMA_FSTS_PFO)
879                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
880
881         spin_unlock_irqrestore(&iommu->register_lock, flag);
882         return IRQ_HANDLED;
883 }
884
885 int dmar_set_interrupt(struct intel_iommu *iommu)
886 {
887         int irq, ret;
888
889         irq = create_irq();
890         if (!irq) {
891                 printk(KERN_ERR "IOMMU: no free vectors\n");
892                 return -EINVAL;
893         }
894
895         set_irq_data(irq, iommu);
896         iommu->irq = irq;
897
898         ret = arch_setup_dmar_msi(irq);
899         if (ret) {
900                 set_irq_data(irq, NULL);
901                 iommu->irq = 0;
902                 destroy_irq(irq);
903                         return ret;
904         }
905
906         /* Force any pending fault records to be cleared */
907         iommu_page_fault(irq, iommu);
908
909         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
910         if (ret)
911                 printk(KERN_ERR "IOMMU: can't request irq\n");
912         return ret;
913 }
914
915 static int iommu_init_domains(struct intel_iommu *iommu)
916 {
917         unsigned long ndomains;
918         unsigned long nlongs;
919
920         ndomains = cap_ndoms(iommu->cap);
921         pr_debug("Number of Domains supported <%ld>\n", ndomains);
922         nlongs = BITS_TO_LONGS(ndomains);
923
924         /* TBD: there might be 64K domains,
925          * consider a different allocation scheme for future chips
926          */
927         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
928         if (!iommu->domain_ids) {
929                 printk(KERN_ERR "Allocating domain id array failed\n");
930                 return -ENOMEM;
931         }
932         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
933                         GFP_KERNEL);
934         if (!iommu->domains) {
935                 printk(KERN_ERR "Allocating domain array failed\n");
936                 kfree(iommu->domain_ids);
937                 return -ENOMEM;
938         }
939
940         spin_lock_init(&iommu->lock);
941
942         /*
943          * if Caching mode is set, then invalid translations are tagged
944          * with domain id 0, so we need to reserve it here.
945          */
946         if (cap_caching_mode(iommu->cap))
947                 set_bit(0, iommu->domain_ids);
948         return 0;
949 }
950
951
952 static void domain_exit(struct dmar_domain *domain);
953
954 void free_dmar_iommu(struct intel_iommu *iommu)
955 {
956         struct dmar_domain *domain;
957         int i;
958
959         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
960         for (; i < cap_ndoms(iommu->cap); ) {
961                 domain = iommu->domains[i];
962                 clear_bit(i, iommu->domain_ids);
963                 domain_exit(domain);
964                 i = find_next_bit(iommu->domain_ids,
965                         cap_ndoms(iommu->cap), i+1);
966         }
967
968         if (iommu->gcmd & DMA_GCMD_TE)
969                 iommu_disable_translation(iommu);
970
971         if (iommu->irq) {
972                 set_irq_data(iommu->irq, NULL);
973                 /* This will mask the irq */
974                 free_irq(iommu->irq, iommu);
975                 destroy_irq(iommu->irq);
976         }
977
978         kfree(iommu->domains);
979         kfree(iommu->domain_ids);
980
981         /* free context mapping */
982         free_context_table(iommu);
983 }
984
985 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
986 {
987         unsigned long num;
988         unsigned long ndomains;
989         struct dmar_domain *domain;
990         unsigned long flags;
991
992         domain = alloc_domain_mem();
993         if (!domain)
994                 return NULL;
995
996         ndomains = cap_ndoms(iommu->cap);
997
998         spin_lock_irqsave(&iommu->lock, flags);
999         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1000         if (num >= ndomains) {
1001                 spin_unlock_irqrestore(&iommu->lock, flags);
1002                 free_domain_mem(domain);
1003                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1004                 return NULL;
1005         }
1006
1007         set_bit(num, iommu->domain_ids);
1008         domain->id = num;
1009         domain->iommu = iommu;
1010         iommu->domains[num] = domain;
1011         spin_unlock_irqrestore(&iommu->lock, flags);
1012
1013         return domain;
1014 }
1015
1016 static void iommu_free_domain(struct dmar_domain *domain)
1017 {
1018         unsigned long flags;
1019
1020         spin_lock_irqsave(&domain->iommu->lock, flags);
1021         clear_bit(domain->id, domain->iommu->domain_ids);
1022         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1023 }
1024
1025 static struct iova_domain reserved_iova_list;
1026 static struct lock_class_key reserved_alloc_key;
1027 static struct lock_class_key reserved_rbtree_key;
1028
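/*
 * Addresses that the chipset decodes as something other than an access to
 * system memory must never be handed out as IOVAs: the IOAPIC/MSI window
 * at 0xfee00000-0xfeefffff and all PCI MMIO ranges (to avoid peer-to-peer
 * decoding).  Reserve them once here; every domain copies this list into
 * its own iova allocator.
 */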
1029 static void dmar_init_reserved_ranges(void)
1030 {
1031         struct pci_dev *pdev = NULL;
1032         struct iova *iova;
1033         int i;
1034         u64 addr, size;
1035
1036         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1037
1038         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1039                 &reserved_alloc_key);
1040         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1041                 &reserved_rbtree_key);
1042
1043         /* IOAPIC ranges shouldn't be accessed by DMA */
1044         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1045                 IOVA_PFN(IOAPIC_RANGE_END));
1046         if (!iova)
1047                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1048
1049         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1050         for_each_pci_dev(pdev) {
1051                 struct resource *r;
1052
1053                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1054                         r = &pdev->resource[i];
1055                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1056                                 continue;
1057                         addr = r->start;
1058                         addr &= PAGE_MASK;
1059                         size = r->end - addr;
1060                         size = PAGE_ALIGN(size);
1061                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1062                                 IOVA_PFN(size + addr) - 1);
1063                         if (!iova)
1064                                 printk(KERN_ERR "Reserve iova failed\n");
1065                 }
1066         }
1067
1068 }
1069
1070 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1071 {
1072         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1073 }
1074
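/*
 * Round the guest address width up to the next width the page-table
 * format can express (a multiple of 9 bits above the 12-bit page offset).
 * For example, 48 is kept as-is ((48 - 12) % 9 == 0), while 40 becomes
 * 40 + 9 - ((40 - 12) % 9) = 48.
 */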
1075 static inline int guestwidth_to_adjustwidth(int gaw)
1076 {
1077         int agaw;
1078         int r = (gaw - 12) % 9;
1079
1080         if (r == 0)
1081                 agaw = gaw;
1082         else
1083                 agaw = gaw + 9 - r;
1084         if (agaw > 64)
1085                 agaw = 64;
1086         return agaw;
1087 }
1088
1089 static int domain_init(struct dmar_domain *domain, int guest_width)
1090 {
1091         struct intel_iommu *iommu;
1092         int adjust_width, agaw;
1093         unsigned long sagaw;
1094
1095         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1096         spin_lock_init(&domain->mapping_lock);
1097
1098         domain_reserve_special_ranges(domain);
1099
1100         /* calculate AGAW */
1101         iommu = domain->iommu;
1102         if (guest_width > cap_mgaw(iommu->cap))
1103                 guest_width = cap_mgaw(iommu->cap);
1104         domain->gaw = guest_width;
1105         adjust_width = guestwidth_to_adjustwidth(guest_width);
1106         agaw = width_to_agaw(adjust_width);
1107         sagaw = cap_sagaw(iommu->cap);
1108         if (!test_bit(agaw, &sagaw)) {
1109                 /* hardware doesn't support it, choose a bigger one */
1110                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1111                 agaw = find_next_bit(&sagaw, 5, agaw);
1112                 if (agaw >= 5)
1113                         return -ENODEV;
1114         }
1115         domain->agaw = agaw;
1116         INIT_LIST_HEAD(&domain->devices);
1117
1118         /* always allocate the top pgd */
1119         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1120         if (!domain->pgd)
1121                 return -ENOMEM;
1122         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1123         return 0;
1124 }
1125
1126 static void domain_exit(struct dmar_domain *domain)
1127 {
1128         u64 end;
1129
1130         /* Domain 0 is reserved, so don't process it */
1131         if (!domain)
1132                 return;
1133
1134         domain_remove_dev_info(domain);
1135         /* destroy iovas */
1136         put_iova_domain(&domain->iovad);
1137         end = DOMAIN_MAX_ADDR(domain->gaw);
1138         end = end & PAGE_MASK;
1139
1140         /* clear ptes */
1141         dma_pte_clear_range(domain, 0, end);
1142
1143         /* free page tables */
1144         dma_pte_free_pagetable(domain, 0, end);
1145
1146         iommu_free_domain(domain);
1147         free_domain_mem(domain);
1148 }
1149
1150 static int domain_context_mapping_one(struct dmar_domain *domain,
1151                 u8 bus, u8 devfn)
1152 {
1153         struct context_entry *context;
1154         struct intel_iommu *iommu = domain->iommu;
1155         unsigned long flags;
1156
1157         pr_debug("Set context mapping for %02x:%02x.%d\n",
1158                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1159         BUG_ON(!domain->pgd);
1160         context = device_to_context_entry(iommu, bus, devfn);
1161         if (!context)
1162                 return -ENOMEM;
1163         spin_lock_irqsave(&iommu->lock, flags);
1164         if (context_present(*context)) {
1165                 spin_unlock_irqrestore(&iommu->lock, flags);
1166                 return 0;
1167         }
1168
1169         context_set_domain_id(*context, domain->id);
1170         context_set_address_width(*context, domain->agaw);
1171         context_set_address_root(*context, virt_to_phys(domain->pgd));
1172         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1173         context_set_fault_enable(*context);
1174         context_set_present(*context);
1175         __iommu_flush_cache(iommu, context, sizeof(*context));
1176
1177         /* it's a non-present to present mapping */
1178         if (iommu->flush.flush_context(iommu, domain->id,
1179                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1180                 DMA_CCMD_DEVICE_INVL, 1))
1181                 iommu_flush_write_buffer(iommu);
1182         else
1183                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1184
1185         spin_unlock_irqrestore(&iommu->lock, flags);
1186         return 0;
1187 }
1188
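/*
 * DMA from a device behind a PCIe-to-PCI(-X) bridge is tagged with the
 * bridge's source-id rather than the device's own, so the context entries
 * for every bridge on the upstream path must point at this domain as well.
 */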
1189 static int
1190 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1191 {
1192         int ret;
1193         struct pci_dev *tmp, *parent;
1194
1195         ret = domain_context_mapping_one(domain, pdev->bus->number,
1196                 pdev->devfn);
1197         if (ret)
1198                 return ret;
1199
1200         /* dependent device mapping */
1201         tmp = pci_find_upstream_pcie_bridge(pdev);
1202         if (!tmp)
1203                 return 0;
1204         /* Secondary interface's bus number and devfn 0 */
1205         parent = pdev->bus->self;
1206         while (parent != tmp) {
1207                 ret = domain_context_mapping_one(domain, parent->bus->number,
1208                         parent->devfn);
1209                 if (ret)
1210                         return ret;
1211                 parent = parent->bus->self;
1212         }
1213         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1214                 return domain_context_mapping_one(domain,
1215                         tmp->subordinate->number, 0);
1216         else /* this is a legacy PCI bridge */
1217                 return domain_context_mapping_one(domain,
1218                         tmp->bus->number, tmp->devfn);
1219 }
1220
1221 static int domain_context_mapped(struct dmar_domain *domain,
1222         struct pci_dev *pdev)
1223 {
1224         int ret;
1225         struct pci_dev *tmp, *parent;
1226
1227         ret = device_context_mapped(domain->iommu,
1228                 pdev->bus->number, pdev->devfn);
1229         if (!ret)
1230                 return ret;
1231         /* dependent device mapping */
1232         tmp = pci_find_upstream_pcie_bridge(pdev);
1233         if (!tmp)
1234                 return ret;
1235         /* Secondary interface's bus number and devfn 0 */
1236         parent = pdev->bus->self;
1237         while (parent != tmp) {
1238                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1239                         parent->devfn);
1240                 if (!ret)
1241                         return ret;
1242                 parent = parent->bus->self;
1243         }
1244         if (tmp->is_pcie)
1245                 return device_context_mapped(domain->iommu,
1246                         tmp->subordinate->number, 0);
1247         else
1248                 return device_context_mapped(domain->iommu,
1249                         tmp->bus->number, tmp->devfn);
1250 }
1251
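/*
 * Map [hpa, hpa + size) to IOVA [iova, iova + size) one 4KiB VT-d page at
 * a time; a partially covered final page is rounded up by VTD_PAGE_ALIGN,
 * so e.g. a 6KiB region produces two leaf PTEs.
 */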
1252 static int
1253 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1254                         u64 hpa, size_t size, int prot)
1255 {
1256         u64 start_pfn, end_pfn;
1257         struct dma_pte *pte;
1258         int index;
1259         int addr_width = agaw_to_width(domain->agaw);
1260
1261         hpa &= (((u64)1) << addr_width) - 1;
1262
1263         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1264                 return -EINVAL;
1265         iova &= PAGE_MASK;
1266         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1267         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1268         index = 0;
1269         while (start_pfn < end_pfn) {
1270                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1271                 if (!pte)
1272                         return -ENOMEM;
1273                 /* We don't need a lock here; nobody else
1274                  * touches this iova range
1275                  */
1276                 BUG_ON(dma_pte_addr(*pte));
1277                 dma_set_pte_addr(*pte, start_pfn << VTD_PAGE_SHIFT);
1278                 dma_set_pte_prot(*pte, prot);
1279                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1280                 start_pfn++;
1281                 index++;
1282         }
1283         return 0;
1284 }
1285
1286 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1287 {
1288         clear_context_table(domain->iommu, bus, devfn);
1289         domain->iommu->flush.flush_context(domain->iommu, 0, 0, 0,
1290                                            DMA_CCMD_GLOBAL_INVL, 0);
1291         domain->iommu->flush.flush_iotlb(domain->iommu, 0, 0, 0,
1292                                          DMA_TLB_GLOBAL_FLUSH, 0);
1293 }
1294
1295 static void domain_remove_dev_info(struct dmar_domain *domain)
1296 {
1297         struct device_domain_info *info;
1298         unsigned long flags;
1299
1300         spin_lock_irqsave(&device_domain_lock, flags);
1301         while (!list_empty(&domain->devices)) {
1302                 info = list_entry(domain->devices.next,
1303                         struct device_domain_info, link);
1304                 list_del(&info->link);
1305                 list_del(&info->global);
1306                 if (info->dev)
1307                         info->dev->dev.archdata.iommu = NULL;
1308                 spin_unlock_irqrestore(&device_domain_lock, flags);
1309
1310                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1311                 free_devinfo_mem(info);
1312
1313                 spin_lock_irqsave(&device_domain_lock, flags);
1314         }
1315         spin_unlock_irqrestore(&device_domain_lock, flags);
1316 }
1317
1318 /*
1319  * find_domain
1320  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1321  */
1322 static struct dmar_domain *
1323 find_domain(struct pci_dev *pdev)
1324 {
1325         struct device_domain_info *info;
1326
1327         /* No lock here, assumes no domain exit in normal case */
1328         info = pdev->dev.archdata.iommu;
1329         if (info)
1330                 return info->domain;
1331         return NULL;
1332 }
1333
1334 /* domain is initialized */
1335 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1336 {
1337         struct dmar_domain *domain, *found = NULL;
1338         struct intel_iommu *iommu;
1339         struct dmar_drhd_unit *drhd;
1340         struct device_domain_info *info, *tmp;
1341         struct pci_dev *dev_tmp;
1342         unsigned long flags;
1343         int bus = 0, devfn = 0;
1344
1345         domain = find_domain(pdev);
1346         if (domain)
1347                 return domain;
1348
1349         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1350         if (dev_tmp) {
1351                 if (dev_tmp->is_pcie) {
1352                         bus = dev_tmp->subordinate->number;
1353                         devfn = 0;
1354                 } else {
1355                         bus = dev_tmp->bus->number;
1356                         devfn = dev_tmp->devfn;
1357                 }
1358                 spin_lock_irqsave(&device_domain_lock, flags);
1359                 list_for_each_entry(info, &device_domain_list, global) {
1360                         if (info->bus == bus && info->devfn == devfn) {
1361                                 found = info->domain;
1362                                 break;
1363                         }
1364                 }
1365                 spin_unlock_irqrestore(&device_domain_lock, flags);
1366                 /* the pcie-pci bridge already has a domain, use it */
1367                 if (found) {
1368                         domain = found;
1369                         goto found_domain;
1370                 }
1371         }
1372
1373         /* Allocate new domain for the device */
1374         drhd = dmar_find_matched_drhd_unit(pdev);
1375         if (!drhd) {
1376                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1377                         pci_name(pdev));
1378                 return NULL;
1379         }
1380         iommu = drhd->iommu;
1381
1382         domain = iommu_alloc_domain(iommu);
1383         if (!domain)
1384                 goto error;
1385
1386         if (domain_init(domain, gaw)) {
1387                 domain_exit(domain);
1388                 goto error;
1389         }
1390
1391         /* register pcie-to-pci device */
1392         if (dev_tmp) {
1393                 info = alloc_devinfo_mem();
1394                 if (!info) {
1395                         domain_exit(domain);
1396                         goto error;
1397                 }
1398                 info->bus = bus;
1399                 info->devfn = devfn;
1400                 info->dev = NULL;
1401                 info->domain = domain;
1402                 /* This domain is shared by devices under p2p bridge */
1403                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1404
1405                 /* the pcie-to-pci bridge already has a domain, use it */
1406                 found = NULL;
1407                 spin_lock_irqsave(&device_domain_lock, flags);
1408                 list_for_each_entry(tmp, &device_domain_list, global) {
1409                         if (tmp->bus == bus && tmp->devfn == devfn) {
1410                                 found = tmp->domain;
1411                                 break;
1412                         }
1413                 }
1414                 if (found) {
1415                         free_devinfo_mem(info);
1416                         domain_exit(domain);
1417                         domain = found;
1418                 } else {
1419                         list_add(&info->link, &domain->devices);
1420                         list_add(&info->global, &device_domain_list);
1421                 }
1422                 spin_unlock_irqrestore(&device_domain_lock, flags);
1423         }
1424
1425 found_domain:
1426         info = alloc_devinfo_mem();
1427         if (!info)
1428                 goto error;
1429         info->bus = pdev->bus->number;
1430         info->devfn = pdev->devfn;
1431         info->dev = pdev;
1432         info->domain = domain;
1433         spin_lock_irqsave(&device_domain_lock, flags);
1434         /* somebody else raced us and set it first */
1435         found = find_domain(pdev);
1436         if (found != NULL) {
1437                 spin_unlock_irqrestore(&device_domain_lock, flags);
1438                 if (found != domain) {
1439                         domain_exit(domain);
1440                         domain = found;
1441                 }
1442                 free_devinfo_mem(info);
1443                 return domain;
1444         }
1445         list_add(&info->link, &domain->devices);
1446         list_add(&info->global, &device_domain_list);
1447         pdev->dev.archdata.iommu = info;
1448         spin_unlock_irqrestore(&device_domain_lock, flags);
1449         return domain;
1450 error:
1451         /* recheck it here; someone else may have set it meanwhile */
1452         return find_domain(pdev);
1453 }
1454
1455 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1456                                       unsigned long long start,
1457                                       unsigned long long end)
1458 {
1459         struct dmar_domain *domain;
1460         unsigned long size;
1461         unsigned long long base;
1462         int ret;
1463
1464         printk(KERN_INFO
1465                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1466                 pci_name(pdev), start, end);
1467         /* page table init */
1468         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1469         if (!domain)
1470                 return -ENOMEM;
1471
1472         /* The address might not be aligned */
1473         base = start & PAGE_MASK;
1474         size = end - base;
1475         size = PAGE_ALIGN(size);
1476         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1477                         IOVA_PFN(base + size) - 1)) {
1478                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1479                 ret = -ENOMEM;
1480                 goto error;
1481         }
1482
1483         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1484                 size, base, pci_name(pdev));
1485         /*
1486          * the RMRR range might overlap with the physical memory range,
1487          * so clear it first
1488          */
1489         dma_pte_clear_range(domain, base, base + size);
1490
1491         ret = domain_page_mapping(domain, base, base, size,
1492                 DMA_PTE_READ|DMA_PTE_WRITE);
1493         if (ret)
1494                 goto error;
1495
1496         /* context entry init */
1497         ret = domain_context_mapping(domain, pdev);
1498         if (!ret)
1499                 return 0;
1500 error:
1501         domain_exit(domain);
1502         return ret;
1503
1504 }
1505
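/*
 * RMRRs (Reserved Memory Region Reporting structures in the DMAR table)
 * describe memory that the BIOS expects a device to keep using for DMA,
 * e.g. USB controllers doing legacy keyboard emulation.  Identity-map
 * those ranges so that DMA in flight keeps working once translation is
 * enabled.
 */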
1506 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1507         struct pci_dev *pdev)
1508 {
1509         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1510                 return 0;
1511         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1512                 rmrr->end_address + 1);
1513 }
1514
1515 #ifdef CONFIG_DMAR_GFX_WA
1516 struct iommu_prepare_data {
1517         struct pci_dev *pdev;
1518         int ret;
1519 };
1520
1521 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1522                                          unsigned long end_pfn, void *datax)
1523 {
1524         struct iommu_prepare_data *data;
1525
1526         data = (struct iommu_prepare_data *)datax;
1527
1528         data->ret = iommu_prepare_identity_map(data->pdev,
1529                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1530         return data->ret;
1532 }
1533
1534 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1535 {
1536         int nid;
1537         struct iommu_prepare_data data;
1538
1539         data.pdev = pdev;
1540         data.ret = 0;
1541
1542         for_each_online_node(nid) {
1543                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1544                 if (data.ret)
1545                         return data.ret;
1546         }
1547         return data.ret;
1548 }
1549
1550 static void __init iommu_prepare_gfx_mapping(void)
1551 {
1552         struct pci_dev *pdev = NULL;
1553         int ret;
1554
1555         for_each_pci_dev(pdev) {
1556                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1557                                 !IS_GFX_DEVICE(pdev))
1558                         continue;
1559                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1560                         pci_name(pdev));
1561                 ret = iommu_prepare_with_active_regions(pdev);
1562                 if (ret)
1563                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1564         }
1565 }
1566 #endif
1567
1568 #ifdef CONFIG_DMAR_FLOPPY_WA
1569 static inline void iommu_prepare_isa(void)
1570 {
1571         struct pci_dev *pdev;
1572         int ret;
1573
1574         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1575         if (!pdev)
1576                 return;
1577
1578         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1579         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1580
1581         if (ret)
1582                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1583                         "floppy might not work\n");
1584
1585 }
1586 #else
1587 static inline void iommu_prepare_isa(void)
1588 {
1589         return;
1590 }
1591 #endif /* CONFIG_DMAR_FLOPPY_WA */
1592
1593 int __init init_dmars(void)
1594 {
1595         struct dmar_drhd_unit *drhd;
1596         struct dmar_rmrr_unit *rmrr;
1597         struct pci_dev *pdev;
1598         struct intel_iommu *iommu;
1599         int i, ret, unit = 0;
1600
1601         /*
1602          * for each drhd
1603          *    allocate root
1604          *    initialize and program root entry to not present
1605          * endfor
1606          */
1607         for_each_drhd_unit(drhd) {
1608                 g_num_of_iommus++;
1609                 /*
1610                  * no locking needed: this is only incremented in the
1611                  * single-threaded kernel __init code path; all other
1612                  * accesses are read-only
1613                  */
1614         }
1615
1616         deferred_flush = kzalloc(g_num_of_iommus *
1617                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1618         if (!deferred_flush) {
1619                 ret = -ENOMEM;
1620                 goto error;
1621         }
1622
1623         for_each_drhd_unit(drhd) {
1624                 if (drhd->ignored)
1625                         continue;
1626
1627                 iommu = drhd->iommu;
1628
1629                 ret = iommu_init_domains(iommu);
1630                 if (ret)
1631                         goto error;
1632
1633                 /*
1634                  * TBD:
1635                  * we could share the same root & context tables
1636                  * among all IOMMUs. Need to split it later.
1637                  */
1638                 ret = iommu_alloc_root_entry(iommu);
1639                 if (ret) {
1640                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1641                         goto error;
1642                 }
1643         }
1644
1645         for_each_drhd_unit(drhd) {
1646                 if (drhd->ignored)
1647                         continue;
1648
1649                 iommu = drhd->iommu;
1650                 if (dmar_enable_qi(iommu)) {
1651                         /*
1652                          * Queued Invalidate not enabled, use Register Based
1653                          * Invalidate
1654                          */
1655                         iommu->flush.flush_context = __iommu_flush_context;
1656                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
1657                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
1658                                "invalidation\n",
1659                                (unsigned long long)drhd->reg_base_addr);
1660                 } else {
1661                         iommu->flush.flush_context = qi_flush_context;
1662                         iommu->flush.flush_iotlb = qi_flush_iotlb;
1663                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
1664                                "invalidation\n",
1665                                (unsigned long long)drhd->reg_base_addr);
1666                 }
1667         }
1668
1669         /*
1670          * For each rmrr
1671          *   for each dev attached to rmrr
1672          *   do
1673          *     locate drhd for dev, alloc domain for dev
1674          *     allocate free domain
1675          *     allocate page table entries for rmrr
1676          *     if context not allocated for bus
1677          *           allocate and init context
1678          *           set present in root table for this bus
1679          *     init context with domain, translation etc
1680          *    endfor
1681          * endfor
1682          */
1683         for_each_rmrr_units(rmrr) {
1684                 for (i = 0; i < rmrr->devices_cnt; i++) {
1685                         pdev = rmrr->devices[i];
1686                         /* some BIOSes list non-existent devices in the DMAR table */
1687                         if (!pdev)
1688                                 continue;
1689                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1690                         if (ret)
1691                                 printk(KERN_ERR
1692                                  "IOMMU: mapping reserved region failed\n");
1693                 }
1694         }
1695
1696         iommu_prepare_gfx_mapping();
1697
1698         iommu_prepare_isa();
1699
1700         /*
1701          * for each drhd
1702          *   enable fault log
1703          *   global invalidate context cache
1704          *   global invalidate iotlb
1705          *   enable translation
1706          */
1707         for_each_drhd_unit(drhd) {
1708                 if (drhd->ignored)
1709                         continue;
1710                 iommu = drhd->iommu;
1711                 sprintf(iommu->name, "dmar%d", unit++);
1712
1713                 iommu_flush_write_buffer(iommu);
1714
1715                 ret = dmar_set_interrupt(iommu);
1716                 if (ret)
1717                         goto error;
1718
1719                 iommu_set_root_entry(iommu);
1720
1721                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
1722                                            0);
1723                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
1724                                          0);
1725                 iommu_disable_protect_mem_regions(iommu);
1726
1727                 ret = iommu_enable_translation(iommu);
1728                 if (ret)
1729                         goto error;
1730         }
1731
1732         return 0;
1733 error:
1734         for_each_drhd_unit(drhd) {
1735                 if (drhd->ignored)
1736                         continue;
1737                 iommu = drhd->iommu;
1738                 free_iommu(iommu);
1739         }
1740         return ret;
1741 }
1742
1743 static inline u64 aligned_size(u64 host_addr, size_t size)
1744 {
1745         u64 addr;
1746         addr = (host_addr & (~PAGE_MASK)) + size;
1747         return PAGE_ALIGN(addr);
1748 }
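/*
 * For example, with 4KiB pages: aligned_size(0x1003, 0x10) keeps the
 * in-page offset (0x3), adds the length (0x13 total) and rounds up to a
 * full page (0x1000), while aligned_size(0xffc, 0x10) straddles a page
 * boundary and rounds up to 0x2000.
 */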
1749
1750 struct iova *
1751 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1752 {
1753         struct iova *piova;
1754
1755         /* Make sure it's in range */
1756         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1757         if (!size || (IOVA_START_ADDR + size > end))
1758                 return NULL;
1759
1760         piova = alloc_iova(&domain->iovad,
1761                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
1762         return piova;
1763 }
1764
1765 static struct iova *
1766 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1767                    size_t size, u64 dma_mask)
1768 {
1769         struct pci_dev *pdev = to_pci_dev(dev);
1770         struct iova *iova = NULL;
1771
1772         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
1773                 iova = iommu_alloc_iova(domain, size, dma_mask);
1774         else {
1775                 /*
1776                  * First try to allocate an io virtual address in
1777                  * DMA_32BIT_MASK and if that fails then try allocating
1778                  * from higher range
1779                  */
1780                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1781                 if (!iova)
1782                         iova = iommu_alloc_iova(domain, size, dma_mask);
1783         }
1784
1785         if (!iova) {
1786                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1787                 return NULL;
1788         }
1789
1790         return iova;
1791 }
1792
1793 static struct dmar_domain *
1794 get_valid_domain_for_dev(struct pci_dev *pdev)
1795 {
1796         struct dmar_domain *domain;
1797         int ret;
1798
1799         domain = get_domain_for_dev(pdev,
1800                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1801         if (!domain) {
1802                 printk(KERN_ERR
1803                         "Allocating domain for %s failed\n", pci_name(pdev));
1804                 return NULL;
1805         }
1806
1807         /* make sure context mapping is ok */
1808         if (unlikely(!domain_context_mapped(domain, pdev))) {
1809                 ret = domain_context_mapping(domain, pdev);
1810                 if (ret) {
1811                         printk(KERN_ERR
1812                                 "Domain context map for %s failed\n",
1813                                 pci_name(pdev));
1814                         return NULL;
1815                 }
1816         }
1817
1818         return domain;
1819 }
1820
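/*
 * For example, assuming 4KiB pages: __intel_map_single() below, asked to
 * map paddr 0x12345678 with size 0x200, rounds the request up to one
 * page, maps iova_base -> 0x12345000, and returns iova_base + 0x678 as
 * the bus address.
 */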
1821 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
1822                                      size_t size, int dir, u64 dma_mask)
1823 {
1824         struct pci_dev *pdev = to_pci_dev(hwdev);
1825         struct dmar_domain *domain;
1826         phys_addr_t start_paddr;
1827         struct iova *iova;
1828         int prot = 0;
1829         int ret;
1830
1831         BUG_ON(dir == DMA_NONE);
1832         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1833                 return paddr;
1834
1835         domain = get_valid_domain_for_dev(pdev);
1836         if (!domain)
1837                 return 0;
1838
1839         size = aligned_size((u64)paddr, size);
1840
1841         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
1842         if (!iova)
1843                 goto error;
1844
1845         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
1846
1847         /*
1848          * Check if DMAR supports zero-length reads on write only
1849          * mappings..
1850          */
1851         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1852                         !cap_zlr(domain->iommu->cap))
1853                 prot |= DMA_PTE_READ;
1854         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1855                 prot |= DMA_PTE_WRITE;
1856         /*
1857          * paddr .. paddr + size may cover only part of a page, so map the
1858          * whole page.  Note: if two parts of one page are mapped separately,
1859          * two guest addresses may map to the same host paddr, but this is
1860          * not a big problem
1861          */
1862         ret = domain_page_mapping(domain, start_paddr,
1863                 ((u64)paddr) & PAGE_MASK, size, prot);
1864         if (ret)
1865                 goto error;
1866
1867         /* it's a non-present to present mapping */
1868         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1869                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
1870         if (ret)
1871                 iommu_flush_write_buffer(domain->iommu);
1872
1873         return start_paddr + ((u64)paddr & (~PAGE_MASK));
1874
1875 error:
1876         if (iova)
1877                 __free_iova(&domain->iovad, iova);
1878         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
1879                 pci_name(pdev), size, (unsigned long long)paddr, dir);
1880         return 0;
1881 }
1882
1883 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
1884                             size_t size, int dir)
1885 {
1886         return __intel_map_single(hwdev, paddr, size, dir,
1887                                   to_pci_dev(hwdev)->dma_mask);
1888 }
1889
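/*
 * Deferred-unmap batching: add_unmap() parks each released mapping in the
 * per-IOMMU deferred_flush table and arms a 10ms timer; flush_unmaps()
 * then issues one global IOTLB flush per IOMMU and frees all the queued
 * IOVAs in a batch.  If HIGH_WATER_MARK entries accumulate before the
 * timer fires, add_unmap() forces the flush synchronously.
 */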
1890 static void flush_unmaps(void)
1891 {
1892         int i, j;
1893
1894         timer_on = 0;
1895
1896         /* just flush them all */
1897         for (i = 0; i < g_num_of_iommus; i++) {
1898                 if (deferred_flush[i].next) {
1899                         struct intel_iommu *iommu =
1900                                 deferred_flush[i].domain[0]->iommu;
1901
1902                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1903                                                  DMA_TLB_GLOBAL_FLUSH, 0);
1904                         for (j = 0; j < deferred_flush[i].next; j++) {
1905                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1906                                                 deferred_flush[i].iova[j]);
1907                         }
1908                         deferred_flush[i].next = 0;
1909                 }
1910         }
1911
1912         list_size = 0;
1913 }
1914
1915 static void flush_unmaps_timeout(unsigned long data)
1916 {
1917         unsigned long flags;
1918
1919         spin_lock_irqsave(&async_umap_flush_lock, flags);
1920         flush_unmaps();
1921         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1922 }
1923
1924 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1925 {
1926         unsigned long flags;
1927         int next, iommu_id;
1928
1929         spin_lock_irqsave(&async_umap_flush_lock, flags);
1930         if (list_size == HIGH_WATER_MARK)
1931                 flush_unmaps();
1932
1933         iommu_id = dom->iommu->seq_id;
1934
1935         next = deferred_flush[iommu_id].next;
1936         deferred_flush[iommu_id].domain[next] = dom;
1937         deferred_flush[iommu_id].iova[next] = iova;
1938         deferred_flush[iommu_id].next++;
1939
1940         if (!timer_on) {
1941                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1942                 timer_on = 1;
1943         }
1944         list_size++;
1945         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1946 }
1947
1948 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
1949                         int dir)
1950 {
1951         struct pci_dev *pdev = to_pci_dev(dev);
1952         struct dmar_domain *domain;
1953         unsigned long start_addr;
1954         struct iova *iova;
1955
1956         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1957                 return;
1958         domain = find_domain(pdev);
1959         BUG_ON(!domain);
1960
1961         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1962         if (!iova)
1963                 return;
1964
1965         start_addr = iova->pfn_lo << PAGE_SHIFT;
1966         size = aligned_size((u64)dev_addr, size);
1967
1968         pr_debug("Device %s unmapping: %lx@%llx\n",
1969                 pci_name(pdev), size, (unsigned long long)start_addr);
1970
1971         /*  clear the whole page */
1972         dma_pte_clear_range(domain, start_addr, start_addr + size);
1973         /* free page tables */
1974         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1975         if (intel_iommu_strict) {
1976                 if (iommu_flush_iotlb_psi(domain->iommu,
1977                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
1978                         iommu_flush_write_buffer(domain->iommu);
1979                 /* free iova */
1980                 __free_iova(&domain->iovad, iova);
1981         } else {
1982                 add_unmap(domain, iova);
1983                 /*
1984                  * queue up the release of the unmap to save the ~1/6th of the
1985                  * CPU time otherwise used up by the iotlb flush operation...
1986                  */
1987         }
1988 }
1989
1990 void *intel_alloc_coherent(struct device *hwdev, size_t size,
1991                            dma_addr_t *dma_handle, gfp_t flags)
1992 {
1993         void *vaddr;
1994         int order;
1995
1996         size = PAGE_ALIGN(size);
1997         order = get_order(size);
1998         flags &= ~(GFP_DMA | GFP_DMA32);
1999
2000         vaddr = (void *)__get_free_pages(flags, order);
2001         if (!vaddr)
2002                 return NULL;
2003         memset(vaddr, 0, size);
2004
2005         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2006                                          DMA_BIDIRECTIONAL,
2007                                          hwdev->coherent_dma_mask);
2008         if (*dma_handle)
2009                 return vaddr;
2010         free_pages((unsigned long)vaddr, order);
2011         return NULL;
2012 }
2013
2014 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2015                          dma_addr_t dma_handle)
2016 {
2017         int order;
2018
2019         size = PAGE_ALIGN(size);
2020         order = get_order(size);
2021
2022         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2023         free_pages((unsigned long)vaddr, order);
2024 }
2025
2026 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2027
2028 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2029                     int nelems, int dir)
2030 {
2031         int i;
2032         struct pci_dev *pdev = to_pci_dev(hwdev);
2033         struct dmar_domain *domain;
2034         unsigned long start_addr;
2035         struct iova *iova;
2036         size_t size = 0;
2037         void *addr;
2038         struct scatterlist *sg;
2039
2040         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2041                 return;
2042
2043         domain = find_domain(pdev);
2044
2045         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2046         if (!iova)
2047                 return;
2048         for_each_sg(sglist, sg, nelems, i) {
2049                 addr = SG_ENT_VIRT_ADDRESS(sg);
2050                 size += aligned_size((u64)addr, sg->length);
2051         }
2052
2053         start_addr = iova->pfn_lo << PAGE_SHIFT;
2054
2055         /*  clear the whole page */
2056         dma_pte_clear_range(domain, start_addr, start_addr + size);
2057         /* free page tables */
2058         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2059
2060         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2061                         size >> VTD_PAGE_SHIFT, 0))
2062                 iommu_flush_write_buffer(domain->iommu);
2063
2064         /* free iova */
2065         __free_iova(&domain->iovad, iova);
2066 }
2067
2068 static int intel_nontranslate_map_sg(struct device *hwdev,
2069         struct scatterlist *sglist, int nelems, int dir)
2070 {
2071         int i;
2072         struct scatterlist *sg;
2073
2074         for_each_sg(sglist, sg, nelems, i) {
2075                 BUG_ON(!sg_page(sg));
2076                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2077                 sg->dma_length = sg->length;
2078         }
2079         return nelems;
2080 }
2081
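/*
 * Note: intel_map_sg() sizes a single IOVA allocation to cover the whole
 * scatterlist and maps each element at start_addr + offset, so the
 * returned sg->dma_address values are packed back to back (at page
 * granularity) even when the backing buffers are scattered in physical
 * memory.
 */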
2082 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2083                  int dir)
2084 {
2085         void *addr;
2086         int i;
2087         struct pci_dev *pdev = to_pci_dev(hwdev);
2088         struct dmar_domain *domain;
2089         size_t size = 0;
2090         int prot = 0;
2091         size_t offset = 0;
2092         struct iova *iova = NULL;
2093         int ret;
2094         struct scatterlist *sg;
2095         unsigned long start_addr;
2096
2097         BUG_ON(dir == DMA_NONE);
2098         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2099                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2100
2101         domain = get_valid_domain_for_dev(pdev);
2102         if (!domain)
2103                 return 0;
2104
2105         for_each_sg(sglist, sg, nelems, i) {
2106                 addr = SG_ENT_VIRT_ADDRESS(sg);
2107                 addr = (void *)virt_to_phys(addr);
2108                 size += aligned_size((u64)addr, sg->length);
2109         }
2110
2111         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2112         if (!iova) {
2113                 sglist->dma_length = 0;
2114                 return 0;
2115         }
2116
2117         /*
2118          * Check if DMAR supports zero-length reads on write only
2119          * mappings..
2120          */
2121         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2122                         !cap_zlr(domain->iommu->cap))
2123                 prot |= DMA_PTE_READ;
2124         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2125                 prot |= DMA_PTE_WRITE;
2126
2127         start_addr = iova->pfn_lo << PAGE_SHIFT;
2128         offset = 0;
2129         for_each_sg(sglist, sg, nelems, i) {
2130                 addr = SG_ENT_VIRT_ADDRESS(sg);
2131                 addr = (void *)virt_to_phys(addr);
2132                 size = aligned_size((u64)addr, sg->length);
2133                 ret = domain_page_mapping(domain, start_addr + offset,
2134                         ((u64)addr) & PAGE_MASK,
2135                         size, prot);
2136                 if (ret) {
2137                         /*  clear the page */
2138                         dma_pte_clear_range(domain, start_addr,
2139                                   start_addr + offset);
2140                         /* free page tables */
2141                         dma_pte_free_pagetable(domain, start_addr,
2142                                   start_addr + offset);
2143                         /* free iova */
2144                         __free_iova(&domain->iovad, iova);
2145                         return 0;
2146                 }
2147                 sg->dma_address = start_addr + offset +
2148                                 ((u64)addr & (~PAGE_MASK));
2149                 sg->dma_length = sg->length;
2150                 offset += size;
2151         }
2152
2153         /* it's a non-present to present mapping */
2154         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2155                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2156                 iommu_flush_write_buffer(domain->iommu);
2157         return nelems;
2158 }
2159
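/*
 * A minimal sketch of how a driver reaches the ops below, assuming the
 * x86 dma_mapping dispatch of this kernel generation: once
 * intel_iommu_init() has set dma_ops = &intel_dma_ops, a call such as
 *
 *      dma_addr_t dma = dma_map_single(&pdev->dev, buf, len,
 *                                      DMA_TO_DEVICE);
 *
 * is routed to intel_map_single(), which allocates an IOVA, fills in the
 * domain page tables and returns the bus address the device must use;
 * the matching dma_unmap_single() call ends up in intel_unmap_single().
 */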
2160 static struct dma_mapping_ops intel_dma_ops = {
2161         .alloc_coherent = intel_alloc_coherent,
2162         .free_coherent = intel_free_coherent,
2163         .map_single = intel_map_single,
2164         .unmap_single = intel_unmap_single,
2165         .map_sg = intel_map_sg,
2166         .unmap_sg = intel_unmap_sg,
2167 };
2168
2169 static inline int iommu_domain_cache_init(void)
2170 {
2171         int ret = 0;
2172
2173         iommu_domain_cache = kmem_cache_create("iommu_domain",
2174                                          sizeof(struct dmar_domain),
2175                                          0,
2176                                          SLAB_HWCACHE_ALIGN,
2178                                          NULL);
2179         if (!iommu_domain_cache) {
2180                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2181                 ret = -ENOMEM;
2182         }
2183
2184         return ret;
2185 }
2186
2187 static inline int iommu_devinfo_cache_init(void)
2188 {
2189         int ret = 0;
2190
2191         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2192                                          sizeof(struct device_domain_info),
2193                                          0,
2194                                          SLAB_HWCACHE_ALIGN,
2195                                          NULL);
2196         if (!iommu_devinfo_cache) {
2197                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2198                 ret = -ENOMEM;
2199         }
2200
2201         return ret;
2202 }
2203
2204 static inline int iommu_iova_cache_init(void)
2205 {
2206         int ret = 0;
2207
2208         iommu_iova_cache = kmem_cache_create("iommu_iova",
2209                                          sizeof(struct iova),
2210                                          0,
2211                                          SLAB_HWCACHE_ALIGN,
2212                                          NULL);
2213         if (!iommu_iova_cache) {
2214                 printk(KERN_ERR "Couldn't create iova cache\n");
2215                 ret = -ENOMEM;
2216         }
2217
2218         return ret;
2219 }
2220
2221 static int __init iommu_init_mempool(void)
2222 {
2223         int ret;
2224         ret = iommu_iova_cache_init();
2225         if (ret)
2226                 return ret;
2227
2228         ret = iommu_domain_cache_init();
2229         if (ret)
2230                 goto domain_error;
2231
2232         ret = iommu_devinfo_cache_init();
2233         if (!ret)
2234                 return ret;
2235
2236         kmem_cache_destroy(iommu_domain_cache);
2237 domain_error:
2238         kmem_cache_destroy(iommu_iova_cache);
2239
2240         return -ENOMEM;
2241 }
2242
2243 static void __init iommu_exit_mempool(void)
2244 {
2245         kmem_cache_destroy(iommu_devinfo_cache);
2246         kmem_cache_destroy(iommu_domain_cache);
2247         kmem_cache_destroy(iommu_iova_cache);
2248
2249 }
2250
2251 static void __init init_no_remapping_devices(void)
2252 {
2253         struct dmar_drhd_unit *drhd;
2254
2255         for_each_drhd_unit(drhd) {
2256                 if (!drhd->include_all) {
2257                         int i;
2258                         for (i = 0; i < drhd->devices_cnt; i++)
2259                                 if (drhd->devices[i] != NULL)
2260                                         break;
2261                         /* ignore DMAR unit if no pci devices exist */
2262                         if (i == drhd->devices_cnt)
2263                                 drhd->ignored = 1;
2264                 }
2265         }
2266
2267         if (dmar_map_gfx)
2268                 return;
2269
2270         for_each_drhd_unit(drhd) {
2271                 int i;
2272                 if (drhd->ignored || drhd->include_all)
2273                         continue;
2274
2275                 for (i = 0; i < drhd->devices_cnt; i++)
2276                         if (drhd->devices[i] &&
2277                                 !IS_GFX_DEVICE(drhd->devices[i]))
2278                                 break;
2279
2280                 if (i < drhd->devices_cnt)
2281                         continue;
2282
2283                 /* bypass IOMMU if it is just for gfx devices */
2284                 drhd->ignored = 1;
2285                 for (i = 0; i < drhd->devices_cnt; i++) {
2286                         if (!drhd->devices[i])
2287                                 continue;
2288                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2289                 }
2290         }
2291 }
2292
2293 int __init intel_iommu_init(void)
2294 {
2295         int ret = 0;
2296
2297         if (dmar_table_init())
2298                 return  -ENODEV;
2299
2300         if (dmar_dev_scope_init())
2301                 return  -ENODEV;
2302
2303         /*
2304          * Check the need for DMA-remapping initialization now.
2305          * Above initialization will also be used by Interrupt-remapping.
2306          */
2307         if (no_iommu || swiotlb || dmar_disabled)
2308                 return -ENODEV;
2309
2310         iommu_init_mempool();
2311         dmar_init_reserved_ranges();
2312
2313         init_no_remapping_devices();
2314
2315         ret = init_dmars();
2316         if (ret) {
2317                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2318                 put_iova_domain(&reserved_iova_list);
2319                 iommu_exit_mempool();
2320                 return ret;
2321         }
2322         printk(KERN_INFO
2323         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2324
2325         init_timer(&unmap_timer);
2326         force_iommu = 1;
2327         dma_ops = &intel_dma_ops;
2328         return 0;
2329 }
2330
2331 void intel_iommu_domain_exit(struct dmar_domain *domain)
2332 {
2333         u64 end;
2334
2335         /* Domain 0 is reserved, so don't process it */
2336         if (!domain)
2337                 return;
2338
2339         end = DOMAIN_MAX_ADDR(domain->gaw);
2340         end = end & (~VTD_PAGE_MASK);
2341
2342         /* clear ptes */
2343         dma_pte_clear_range(domain, 0, end);
2344
2345         /* free page tables */
2346         dma_pte_free_pagetable(domain, 0, end);
2347
2348         iommu_free_domain(domain);
2349         free_domain_mem(domain);
2350 }
2351 EXPORT_SYMBOL_GPL(intel_iommu_domain_exit);
2352
2353 struct dmar_domain *intel_iommu_domain_alloc(struct pci_dev *pdev)
2354 {
2355         struct dmar_drhd_unit *drhd;
2356         struct dmar_domain *domain;
2357         struct intel_iommu *iommu;
2358
2359         drhd = dmar_find_matched_drhd_unit(pdev);
2360         if (!drhd) {
2361                 printk(KERN_ERR "intel_iommu_domain_alloc: drhd == NULL\n");
2362                 return NULL;
2363         }
2364
2365         iommu = drhd->iommu;
2366         if (!iommu) {
2367                 printk(KERN_ERR
2368                         "intel_iommu_domain_alloc: iommu == NULL\n");
2369                 return NULL;
2370         }
2371         domain = iommu_alloc_domain(iommu);
2372         if (!domain) {
2373                 printk(KERN_ERR
2374                         "intel_iommu_domain_alloc: domain == NULL\n");
2375                 return NULL;
2376         }
2377         if (domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2378                 printk(KERN_ERR
2379                         "intel_iommu_domain_alloc: domain_init() failed\n");
2380                 intel_iommu_domain_exit(domain);
2381                 return NULL;
2382         }
2383         return domain;
2384 }
2385 EXPORT_SYMBOL_GPL(intel_iommu_domain_alloc);
2386
2387 int intel_iommu_context_mapping(
2388         struct dmar_domain *domain, struct pci_dev *pdev)
2389 {
2390         int rc;
2391         rc = domain_context_mapping(domain, pdev);
2392         return rc;
2393 }
2394 EXPORT_SYMBOL_GPL(intel_iommu_context_mapping);
2395
2396 int intel_iommu_page_mapping(
2397         struct dmar_domain *domain, dma_addr_t iova,
2398         u64 hpa, size_t size, int prot)
2399 {
2400         int rc;
2401         rc = domain_page_mapping(domain, iova, hpa, size, prot);
2402         return rc;
2403 }
2404 EXPORT_SYMBOL_GPL(intel_iommu_page_mapping);
2405
2406 void intel_iommu_detach_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
2407 {
2408         detach_domain_for_dev(domain, bus, devfn);
2409 }
2410 EXPORT_SYMBOL_GPL(intel_iommu_detach_dev);
2411
2412 struct dmar_domain *
2413 intel_iommu_find_domain(struct pci_dev *pdev)
2414 {
2415         return find_domain(pdev);
2416 }
2417 EXPORT_SYMBOL_GPL(intel_iommu_find_domain);
2418
2419 int intel_iommu_found(void)
2420 {
2421         return g_num_of_iommus;
2422 }
2423 EXPORT_SYMBOL_GPL(intel_iommu_found);
2424
2425 u64 intel_iommu_iova_to_pfn(struct dmar_domain *domain, u64 iova)
2426 {
2427         struct dma_pte *pte;
2428         u64 pfn;
2429
2430         pfn = 0;
2431         pte = addr_to_dma_pte(domain, iova);
2432
2433         if (pte)
2434                 pfn = dma_pte_addr(*pte);
2435
2436         return pfn >> VTD_PAGE_SHIFT;
2437 }
2438 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_pfn);