[linux-2.6] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/debugfs.h>
26 #include <linux/slab.h>
27 #include <linux/irq.h>
28 #include <linux/interrupt.h>
29 #include <linux/sysdev.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include "iova.h"
37 #include "intel-iommu.h"
38 #include <asm/proto.h> /* force_iommu is declared in this header on x86-64 */
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
44 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
45
46 #define IOAPIC_RANGE_START      (0xfee00000)
47 #define IOAPIC_RANGE_END        (0xfeefffff)
48 #define IOVA_START_ADDR         (0x1000)
49
50 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
51
52 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
53
54
55 static void flush_unmaps_timeout(unsigned long data);
56
57 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
58
59 #define HIGH_WATER_MARK 250
60 struct deferred_flush_tables {
61         int next;
62         struct iova *iova[HIGH_WATER_MARK];
63         struct dmar_domain *domain[HIGH_WATER_MARK];
64 };
65
66 static struct deferred_flush_tables *deferred_flush;
67
68 /* bitmap for indexing intel_iommus */
69 static int g_num_of_iommus;
70
71 static DEFINE_SPINLOCK(async_umap_flush_lock);
72 static LIST_HEAD(unmaps_to_do);
73
74 static int timer_on;
75 static long list_size;
76
77 static void domain_remove_dev_info(struct dmar_domain *domain);
78
79 int dmar_disabled;
80 static int __initdata dmar_map_gfx = 1;
81 static int dmar_forcedac;
82 static int intel_iommu_strict;
83
84 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
85 static DEFINE_SPINLOCK(device_domain_lock);
86 static LIST_HEAD(device_domain_list);
87
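/*
 * Early parameter parser for "intel_iommu=".  Recognized options, which may
 * be combined with commas: "off" (disable the IOMMU entirely), "igfx_off"
 * (skip mapping of graphics devices), "forcedac" (force dual-address-cycle,
 * i.e. 64-bit, DMA addressing) and "strict" (flush the IOTLB synchronously
 * instead of batching unmaps).
 */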
88 static int __init intel_iommu_setup(char *str)
89 {
90         if (!str)
91                 return -EINVAL;
92         while (*str) {
93                 if (!strncmp(str, "off", 3)) {
94                         dmar_disabled = 1;
95                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
96                 } else if (!strncmp(str, "igfx_off", 8)) {
97                         dmar_map_gfx = 0;
98                         printk(KERN_INFO
99                                 "Intel-IOMMU: disable GFX device mapping\n");
100                 } else if (!strncmp(str, "forcedac", 8)) {
101                         printk(KERN_INFO
102                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
103                         dmar_forcedac = 1;
104                 } else if (!strncmp(str, "strict", 6)) {
105                         printk(KERN_INFO
106                                 "Intel-IOMMU: disable batched IOTLB flush\n");
107                         intel_iommu_strict = 1;
108                 }
109
110                 str += strcspn(str, ",");
111                 while (*str == ',')
112                         str++;
113         }
114         return 0;
115 }
116 __setup("intel_iommu=", intel_iommu_setup);
117
118 static struct kmem_cache *iommu_domain_cache;
119 static struct kmem_cache *iommu_devinfo_cache;
120 static struct kmem_cache *iommu_iova_cache;
121
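/*
 * Allocate from a kmem cache with PF_MEMALLOC temporarily set, so the
 * allocation may dip into emergency reserves; the caller's original
 * PF_MEMALLOC state is restored afterwards.
 */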
122 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
123 {
124         unsigned int flags;
125         void *vaddr;
126
127         /* trying to avoid low memory issues */
128         flags = current->flags & PF_MEMALLOC;
129         current->flags |= PF_MEMALLOC;
130         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
131         current->flags &= (~PF_MEMALLOC | flags);
132         return vaddr;
133 }
134
135
136 static inline void *alloc_pgtable_page(void)
137 {
138         unsigned int flags;
139         void *vaddr;
140
141         /* trying to avoid low memory issues */
142         flags = current->flags & PF_MEMALLOC;
143         current->flags |= PF_MEMALLOC;
144         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
145         current->flags &= (~PF_MEMALLOC | flags);
146         return vaddr;
147 }
148
149 static inline void free_pgtable_page(void *vaddr)
150 {
151         free_page((unsigned long)vaddr);
152 }
153
154 static inline void *alloc_domain_mem(void)
155 {
156         return iommu_kmem_cache_alloc(iommu_domain_cache);
157 }
158
159 static inline void free_domain_mem(void *vaddr)
160 {
161         kmem_cache_free(iommu_domain_cache, vaddr);
162 }
163
164 static inline void * alloc_devinfo_mem(void)
165 {
166         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
167 }
168
169 static inline void free_devinfo_mem(void *vaddr)
170 {
171         kmem_cache_free(iommu_devinfo_cache, vaddr);
172 }
173
174 struct iova *alloc_iova_mem(void)
175 {
176         return iommu_kmem_cache_alloc(iommu_iova_cache);
177 }
178
179 void free_iova_mem(struct iova *iova)
180 {
181         kmem_cache_free(iommu_iova_cache, iova);
182 }
183
184 /* Gets context entry for a given bus and devfn */
185 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
186                 u8 bus, u8 devfn)
187 {
188         struct root_entry *root;
189         struct context_entry *context;
190         unsigned long phy_addr;
191         unsigned long flags;
192
193         spin_lock_irqsave(&iommu->lock, flags);
194         root = &iommu->root_entry[bus];
195         context = get_context_addr_from_root(root);
196         if (!context) {
197                 context = (struct context_entry *)alloc_pgtable_page();
198                 if (!context) {
199                         spin_unlock_irqrestore(&iommu->lock, flags);
200                         return NULL;
201                 }
202                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
203                 phy_addr = virt_to_phys((void *)context);
204                 set_root_value(root, phy_addr);
205                 set_root_present(root);
206                 __iommu_flush_cache(iommu, root, sizeof(*root));
207         }
208         spin_unlock_irqrestore(&iommu->lock, flags);
209         return &context[devfn];
210 }
211
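/* Check whether the context entry for (bus, devfn) is marked present. */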
212 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
213 {
214         struct root_entry *root;
215         struct context_entry *context;
216         int ret;
217         unsigned long flags;
218
219         spin_lock_irqsave(&iommu->lock, flags);
220         root = &iommu->root_entry[bus];
221         context = get_context_addr_from_root(root);
222         if (!context) {
223                 ret = 0;
224                 goto out;
225         }
226         ret = context_present(context[devfn]);
227 out:
228         spin_unlock_irqrestore(&iommu->lock, flags);
229         return ret;
230 }
231
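/* Clear the context entry for (bus, devfn) and flush it from the cache. */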
232 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
233 {
234         struct root_entry *root;
235         struct context_entry *context;
236         unsigned long flags;
237
238         spin_lock_irqsave(&iommu->lock, flags);
239         root = &iommu->root_entry[bus];
240         context = get_context_addr_from_root(root);
241         if (context) {
242                 context_clear_entry(context[devfn]);
243                 __iommu_flush_cache(iommu, &context[devfn],
244                         sizeof(*context));
245         }
246         spin_unlock_irqrestore(&iommu->lock, flags);
247 }
248
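/* Free every per-bus context table and then the root entry table itself. */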
249 static void free_context_table(struct intel_iommu *iommu)
250 {
251         struct root_entry *root;
252         int i;
253         unsigned long flags;
254         struct context_entry *context;
255
256         spin_lock_irqsave(&iommu->lock, flags);
257         if (!iommu->root_entry) {
258                 goto out;
259         }
260         for (i = 0; i < ROOT_ENTRY_NR; i++) {
261                 root = &iommu->root_entry[i];
262                 context = get_context_addr_from_root(root);
263                 if (context)
264                         free_pgtable_page(context);
265         }
266         free_pgtable_page(iommu->root_entry);
267         iommu->root_entry = NULL;
268 out:
269         spin_unlock_irqrestore(&iommu->lock, flags);
270 }
271
272 /* page table handling */
273 #define LEVEL_STRIDE            (9)
274 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
275
276 static inline int agaw_to_level(int agaw)
277 {
278         return agaw + 2;
279 }
280
281 static inline int agaw_to_width(int agaw)
282 {
283         return 30 + agaw * LEVEL_STRIDE;
284
285 }
286
287 static inline int width_to_agaw(int width)
288 {
289         return (width - 30) / LEVEL_STRIDE;
290 }
291
292 static inline unsigned int level_to_offset_bits(int level)
293 {
294         return (12 + (level - 1) * LEVEL_STRIDE);
295 }
296
297 static inline int address_level_offset(u64 addr, int level)
298 {
299         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
300 }
301
302 static inline u64 level_mask(int level)
303 {
304         return ((u64)-1 << level_to_offset_bits(level));
305 }
306
307 static inline u64 level_size(int level)
308 {
309         return ((u64)1 << level_to_offset_bits(level));
310 }
311
312 static inline u64 align_to_level(u64 addr, int level)
313 {
314         return ((addr + level_size(level) - 1) & level_mask(level));
315 }
316
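/*
 * Walk the page table down to the level-1 (4K) PTE for @addr, allocating
 * intermediate page-table pages along the way if they are not present yet.
 */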
317 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
318 {
319         int addr_width = agaw_to_width(domain->agaw);
320         struct dma_pte *parent, *pte = NULL;
321         int level = agaw_to_level(domain->agaw);
322         int offset;
323         unsigned long flags;
324
325         BUG_ON(!domain->pgd);
326
327         addr &= (((u64)1) << addr_width) - 1;
328         parent = domain->pgd;
329
330         spin_lock_irqsave(&domain->mapping_lock, flags);
331         while (level > 0) {
332                 void *tmp_page;
333
334                 offset = address_level_offset(addr, level);
335                 pte = &parent[offset];
336                 if (level == 1)
337                         break;
338
339                 if (!dma_pte_present(*pte)) {
340                         tmp_page = alloc_pgtable_page();
341
342                         if (!tmp_page) {
343                                 spin_unlock_irqrestore(&domain->mapping_lock,
344                                         flags);
345                                 return NULL;
346                         }
347                         __iommu_flush_cache(domain->iommu, tmp_page,
348                                         PAGE_SIZE_4K);
349                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
350                         /*
351                          * the higher-level tables always set r/w; the
352                          * last-level page table controls read/write access
353                          */
354                         dma_set_pte_readable(*pte);
355                         dma_set_pte_writable(*pte);
356                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
357                 }
358                 parent = phys_to_virt(dma_pte_addr(*pte));
359                 level--;
360         }
361
362         spin_unlock_irqrestore(&domain->mapping_lock, flags);
363         return pte;
364 }
365
366 /* return the address's pte at a specific level */
367 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
368                 int level)
369 {
370         struct dma_pte *parent, *pte = NULL;
371         int total = agaw_to_level(domain->agaw);
372         int offset;
373
374         parent = domain->pgd;
375         while (level <= total) {
376                 offset = address_level_offset(addr, total);
377                 pte = &parent[offset];
378                 if (level == total)
379                         return pte;
380
381                 if (!dma_pte_present(*pte))
382                         break;
383                 parent = phys_to_virt(dma_pte_addr(*pte));
384                 total--;
385         }
386         return NULL;
387 }
388
389 /* clear one page's page table */
390 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
391 {
392         struct dma_pte *pte = NULL;
393
394         /* get last level pte */
395         pte = dma_addr_level_pte(domain, addr, 1);
396
397         if (pte) {
398                 dma_clear_pte(*pte);
399                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
400         }
401 }
402
403 /* clear last-level ptes; a TLB flush should follow */
404 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
405 {
406         int addr_width = agaw_to_width(domain->agaw);
407
408         start &= (((u64)1) << addr_width) - 1;
409         end &= (((u64)1) << addr_width) - 1;
410         /* in case it's a partial page */
411         start = PAGE_ALIGN_4K(start);
412         end &= PAGE_MASK_4K;
413
414         /* no lock needed here; nobody else touches this iova range */
415         while (start < end) {
416                 dma_pte_clear_one(domain, start);
417                 start += PAGE_SIZE_4K;
418         }
419 }
420
421 /* free page-table pages; the last-level ptes should already be cleared */
422 static void dma_pte_free_pagetable(struct dmar_domain *domain,
423         u64 start, u64 end)
424 {
425         int addr_width = agaw_to_width(domain->agaw);
426         struct dma_pte *pte;
427         int total = agaw_to_level(domain->agaw);
428         int level;
429         u64 tmp;
430
431         start &= (((u64)1) << addr_width) - 1;
432         end &= (((u64)1) << addr_width) - 1;
433
434         /* no lock needed here; nobody else touches this iova range */
435         level = 2;
436         while (level <= total) {
437                 tmp = align_to_level(start, level);
438                 if (tmp >= end || (tmp + level_size(level) > end))
439                         return;
440
441                 while (tmp < end) {
442                         pte = dma_addr_level_pte(domain, tmp, level);
443                         if (pte) {
444                                 free_pgtable_page(
445                                         phys_to_virt(dma_pte_addr(*pte)));
446                                 dma_clear_pte(*pte);
447                                 __iommu_flush_cache(domain->iommu,
448                                                 pte, sizeof(*pte));
449                         }
450                         tmp += level_size(level);
451                 }
452                 level++;
453         }
454         /* free pgd */
455         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
456                 free_pgtable_page(domain->pgd);
457                 domain->pgd = NULL;
458         }
459 }
460
461 /* iommu handling */
462 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
463 {
464         struct root_entry *root;
465         unsigned long flags;
466
467         root = (struct root_entry *)alloc_pgtable_page();
468         if (!root)
469                 return -ENOMEM;
470
471         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
472
473         spin_lock_irqsave(&iommu->lock, flags);
474         iommu->root_entry = root;
475         spin_unlock_irqrestore(&iommu->lock, flags);
476
477         return 0;
478 }
479
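/*
 * Program the root-entry table address into DMAR_RTADDR_REG and issue the
 * Set Root Table Pointer command, waiting for hardware to acknowledge it.
 */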
480 static void iommu_set_root_entry(struct intel_iommu *iommu)
481 {
482         void *addr;
483         u32 cmd, sts;
484         unsigned long flag;
485
486         addr = iommu->root_entry;
487
488         spin_lock_irqsave(&iommu->register_lock, flag);
489         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
490
491         cmd = iommu->gcmd | DMA_GCMD_SRTP;
492         writel(cmd, iommu->reg + DMAR_GCMD_REG);
493
494         /* Make sure hardware completes it */
495         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
496                 readl, (sts & DMA_GSTS_RTPS), sts);
497
498         spin_unlock_irqrestore(&iommu->register_lock, flag);
499 }
500
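/*
 * Explicitly flush the IOMMU write buffer, but only on hardware that
 * requires it (i.e. when the RWBF capability bit is set).
 */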
501 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
502 {
503         u32 val;
504         unsigned long flag;
505
506         if (!cap_rwbf(iommu->cap))
507                 return;
508         val = iommu->gcmd | DMA_GCMD_WBF;
509
510         spin_lock_irqsave(&iommu->register_lock, flag);
511         writel(val, iommu->reg + DMAR_GCMD_REG);
512
513         /* Make sure hardware completes it */
514         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
515                         readl, (!(val & DMA_GSTS_WBFS)), val);
516
517         spin_unlock_irqrestore(&iommu->register_lock, flag);
518 }
519
520 /* the return value determines whether we need a write buffer flush */
521 static int __iommu_flush_context(struct intel_iommu *iommu,
522         u16 did, u16 source_id, u8 function_mask, u64 type,
523         int non_present_entry_flush)
524 {
525         u64 val = 0;
526         unsigned long flag;
527
528         /*
529          * In the non-present entry flush case, if hardware doesn't cache
530          * non-present entries we do nothing; if hardware does cache
531          * non-present entries, we flush the entries of domain 0 (that
532          * domain id is used to cache any non-present entries)
533          */
534         if (non_present_entry_flush) {
535                 if (!cap_caching_mode(iommu->cap))
536                         return 1;
537                 else
538                         did = 0;
539         }
540
541         switch (type) {
542         case DMA_CCMD_GLOBAL_INVL:
543                 val = DMA_CCMD_GLOBAL_INVL;
544                 break;
545         case DMA_CCMD_DOMAIN_INVL:
546                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
547                 break;
548         case DMA_CCMD_DEVICE_INVL:
549                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
550                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
551                 break;
552         default:
553                 BUG();
554         }
555         val |= DMA_CCMD_ICC;
556
557         spin_lock_irqsave(&iommu->register_lock, flag);
558         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
559
560         /* Make sure hardware completes it */
561         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
562                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
563
564         spin_unlock_irqrestore(&iommu->register_lock, flag);
565
566         /* flushing the context entry implicitly flushes the write buffer */
567         return 0;
568 }
569
570 static inline int iommu_flush_context_global(struct intel_iommu *iommu,
571         int non_present_entry_flush)
572 {
573         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
574                 non_present_entry_flush);
575 }
576
577 static inline int iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
578         int non_present_entry_flush)
579 {
580         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
581                 non_present_entry_flush);
582 }
583
584 static inline int iommu_flush_context_device(struct intel_iommu *iommu,
585         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
586 {
587         return __iommu_flush_context(iommu, did, source_id, function_mask,
588                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
589 }
590
591 /* the return value determines whether we need a write buffer flush */
592 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
593         u64 addr, unsigned int size_order, u64 type,
594         int non_present_entry_flush)
595 {
596         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
597         u64 val = 0, val_iva = 0;
598         unsigned long flag;
599
600         /*
601          * In the non-present entry flush case, if hardware doesn't cache
602          * non-present entries we do nothing; if hardware does cache
603          * non-present entries, we flush the entries of domain 0 (that
604          * domain id is used to cache any non-present entries)
605          */
606         if (non_present_entry_flush) {
607                 if (!cap_caching_mode(iommu->cap))
608                         return 1;
609                 else
610                         did = 0;
611         }
612
613         switch (type) {
614         case DMA_TLB_GLOBAL_FLUSH:
615                 /* a global flush doesn't need to set IVA_REG */
616                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
617                 break;
618         case DMA_TLB_DSI_FLUSH:
619                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
620                 break;
621         case DMA_TLB_PSI_FLUSH:
622                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
623                 /* Note: always flush non-leaf currently */
624                 val_iva = size_order | addr;
625                 break;
626         default:
627                 BUG();
628         }
629         /* Note: set drain read/write */
630 #if 0
631         /*
632          * This is probably just extra paranoia; it looks like we can
633          * skip it without any impact.
634          */
635         if (cap_read_drain(iommu->cap))
636                 val |= DMA_TLB_READ_DRAIN;
637 #endif
638         if (cap_write_drain(iommu->cap))
639                 val |= DMA_TLB_WRITE_DRAIN;
640
641         spin_lock_irqsave(&iommu->register_lock, flag);
642         /* Note: Only uses first TLB reg currently */
643         if (val_iva)
644                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
645         dmar_writeq(iommu->reg + tlb_offset + 8, val);
646
647         /* Make sure hardware completes it */
648         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
649                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
650
651         spin_unlock_irqrestore(&iommu->register_lock, flag);
652
653         /* check IOTLB invalidation granularity */
654         if (DMA_TLB_IAIG(val) == 0)
655                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
656         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
657                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
658                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
659         /* flushing the IOTLB implicitly flushes the write buffer */
660         return 0;
661 }
662
663 static inline int iommu_flush_iotlb_global(struct intel_iommu *iommu,
664         int non_present_entry_flush)
665 {
666         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
667                 non_present_entry_flush);
668 }
669
670 static inline int iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
671         int non_present_entry_flush)
672 {
673         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
674                 non_present_entry_flush);
675 }
676
677 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
678         u64 addr, unsigned int pages, int non_present_entry_flush)
679 {
680         unsigned int mask;
681
682         BUG_ON(addr & (~PAGE_MASK_4K));
683         BUG_ON(pages == 0);
684
685         /* Fallback to domain selective flush if no PSI support */
686         if (!cap_pgsel_inv(iommu->cap))
687                 return iommu_flush_iotlb_dsi(iommu, did,
688                         non_present_entry_flush);
689
690         /*
691          * PSI requires the number of pages to be a power of two, with the
692          * base address naturally aligned to that size
693          */
694         mask = ilog2(__roundup_pow_of_two(pages));
695         /* Fallback to domain selective flush if size is too big */
696         if (mask > cap_max_amask_val(iommu->cap))
697                 return iommu_flush_iotlb_dsi(iommu, did,
698                         non_present_entry_flush);
699
700         return __iommu_flush_iotlb(iommu, did, addr, mask,
701                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
702 }
703
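/*
 * Disable the protected memory regions: clear the Enable Protected Memory
 * bit and wait for the protected region status bit to clear.
 */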
704 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
705 {
706         u32 pmen;
707         unsigned long flags;
708
709         spin_lock_irqsave(&iommu->register_lock, flags);
710         pmen = readl(iommu->reg + DMAR_PMEN_REG);
711         pmen &= ~DMA_PMEN_EPM;
712         writel(pmen, iommu->reg + DMAR_PMEN_REG);
713
714         /* wait for the protected region status bit to clear */
715         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
716                 readl, !(pmen & DMA_PMEN_PRS), pmen);
717
718         spin_unlock_irqrestore(&iommu->register_lock, flags);
719 }
720
721 static int iommu_enable_translation(struct intel_iommu *iommu)
722 {
723         u32 sts;
724         unsigned long flags;
725
726         spin_lock_irqsave(&iommu->register_lock, flags);
727         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
728
729         /* Make sure hardware completes it */
730         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
731                 readl, (sts & DMA_GSTS_TES), sts);
732
733         iommu->gcmd |= DMA_GCMD_TE;
734         spin_unlock_irqrestore(&iommu->register_lock, flags);
735         return 0;
736 }
737
738 static int iommu_disable_translation(struct intel_iommu *iommu)
739 {
740         u32 sts;
741         unsigned long flag;
742
743         spin_lock_irqsave(&iommu->register_lock, flag);
744         iommu->gcmd &= ~DMA_GCMD_TE;
745         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
746
747         /* Make sure hardware completes it */
748         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
749                 readl, (!(sts & DMA_GSTS_TES)), sts);
750
751         spin_unlock_irqrestore(&iommu->register_lock, flag);
752         return 0;
753 }
754
755 /* iommu interrupt handling. Most of it is MSI-like. */
756
757 static const char *fault_reason_strings[] =
758 {
759         "Software",
760         "Present bit in root entry is clear",
761         "Present bit in context entry is clear",
762         "Invalid context entry",
763         "Access beyond MGAW",
764         "PTE Write access is not set",
765         "PTE Read access is not set",
766         "Next page table ptr is invalid",
767         "Root table address invalid",
768         "Context table ptr is invalid",
769         "non-zero reserved fields in RTP",
770         "non-zero reserved fields in CTP",
771         "non-zero reserved fields in PTE",
772 };
773 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
774
775 const char *dmar_get_fault_reason(u8 fault_reason)
776 {
777         if (fault_reason > MAX_FAULT_REASON_IDX)
778                 return "Unknown";
779         else
780                 return fault_reason_strings[fault_reason];
781 }
782
783 void dmar_msi_unmask(unsigned int irq)
784 {
785         struct intel_iommu *iommu = get_irq_data(irq);
786         unsigned long flag;
787
788         /* unmask it */
789         spin_lock_irqsave(&iommu->register_lock, flag);
790         writel(0, iommu->reg + DMAR_FECTL_REG);
791         /* Read back a register to flush the posted write */
792         readl(iommu->reg + DMAR_FECTL_REG);
793         spin_unlock_irqrestore(&iommu->register_lock, flag);
794 }
795
796 void dmar_msi_mask(unsigned int irq)
797 {
798         unsigned long flag;
799         struct intel_iommu *iommu = get_irq_data(irq);
800
801         /* mask it */
802         spin_lock_irqsave(&iommu->register_lock, flag);
803         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
804         /* Read back a register to flush the posted write */
805         readl(iommu->reg + DMAR_FECTL_REG);
806         spin_unlock_irqrestore(&iommu->register_lock, flag);
807 }
808
809 void dmar_msi_write(int irq, struct msi_msg *msg)
810 {
811         struct intel_iommu *iommu = get_irq_data(irq);
812         unsigned long flag;
813
814         spin_lock_irqsave(&iommu->register_lock, flag);
815         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
816         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
817         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
818         spin_unlock_irqrestore(&iommu->register_lock, flag);
819 }
820
821 void dmar_msi_read(int irq, struct msi_msg *msg)
822 {
823         struct intel_iommu *iommu = get_irq_data(irq);
824         unsigned long flag;
825
826         spin_lock_irqsave(&iommu->register_lock, flag);
827         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
828         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
829         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
830         spin_unlock_irqrestore(&iommu->register_lock, flag);
831 }
832
833 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
834                 u8 fault_reason, u16 source_id, u64 addr)
835 {
836         const char *reason;
837
838         reason = dmar_get_fault_reason(fault_reason);
839
840         printk(KERN_ERR
841                 "DMAR:[%s] Request device [%02x:%02x.%d] "
842                 "fault addr %llx \n"
843                 "DMAR:[fault reason %02d] %s\n",
844                 (type ? "DMA Read" : "DMA Write"),
845                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
846                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
847         return 0;
848 }
849
850 #define PRIMARY_FAULT_REG_LEN (16)
851 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
852 {
853         struct intel_iommu *iommu = dev_id;
854         int reg, fault_index;
855         u32 fault_status;
856         unsigned long flag;
857
858         spin_lock_irqsave(&iommu->register_lock, flag);
859         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
860
861         /* TBD: the advanced fault log is ignored for now */
862         if (!(fault_status & DMA_FSTS_PPF))
863                 goto clear_overflow;
864
865         fault_index = dma_fsts_fault_record_index(fault_status);
866         reg = cap_fault_reg_offset(iommu->cap);
867         while (1) {
868                 u8 fault_reason;
869                 u16 source_id;
870                 u64 guest_addr;
871                 int type;
872                 u32 data;
873
874                 /* highest 32 bits */
875                 data = readl(iommu->reg + reg +
876                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
877                 if (!(data & DMA_FRCD_F))
878                         break;
879
880                 fault_reason = dma_frcd_fault_reason(data);
881                 type = dma_frcd_type(data);
882
883                 data = readl(iommu->reg + reg +
884                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
885                 source_id = dma_frcd_source_id(data);
886
887                 guest_addr = dmar_readq(iommu->reg + reg +
888                                 fault_index * PRIMARY_FAULT_REG_LEN);
889                 guest_addr = dma_frcd_page_addr(guest_addr);
890                 /* clear the fault */
891                 writel(DMA_FRCD_F, iommu->reg + reg +
892                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
893
894                 spin_unlock_irqrestore(&iommu->register_lock, flag);
895
896                 iommu_page_fault_do_one(iommu, type, fault_reason,
897                                 source_id, guest_addr);
898
899                 fault_index++;
900                 if (fault_index > cap_num_fault_regs(iommu->cap))
901                         fault_index = 0;
902                 spin_lock_irqsave(&iommu->register_lock, flag);
903         }
904 clear_overflow:
905         /* clear primary fault overflow */
906         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
907         if (fault_status & DMA_FSTS_PFO)
908                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
909
910         spin_unlock_irqrestore(&iommu->register_lock, flag);
911         return IRQ_HANDLED;
912 }
913
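/*
 * Allocate an irq for the IOMMU, hook up its MSI, clear any pending faults
 * and register iommu_page_fault() as the fault handler.
 */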
914 int dmar_set_interrupt(struct intel_iommu *iommu)
915 {
916         int irq, ret;
917
918         irq = create_irq();
919         if (!irq) {
920                 printk(KERN_ERR "IOMMU: no free vectors\n");
921                 return -EINVAL;
922         }
923
924         set_irq_data(irq, iommu);
925         iommu->irq = irq;
926
927         ret = arch_setup_dmar_msi(irq);
928         if (ret) {
929                 set_irq_data(irq, NULL);
930                 iommu->irq = 0;
931                 destroy_irq(irq);
932                         return ret;
933         }
934
935         /* Make sure any already-pending faults are processed and cleared */
936         iommu_page_fault(irq, iommu);
937
938         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
939         if (ret)
940                 printk(KERN_ERR "IOMMU: can't request irq\n");
941         return ret;
942 }
943
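/*
 * Allocate the domain-id bitmap and the array of domain pointers for this
 * IOMMU, sized from the number of domains the hardware supports.
 */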
944 static int iommu_init_domains(struct intel_iommu *iommu)
945 {
946         unsigned long ndomains;
947         unsigned long nlongs;
948
949         ndomains = cap_ndoms(iommu->cap);
950         pr_debug("Number of Domains supported <%ld>\n", ndomains);
951         nlongs = BITS_TO_LONGS(ndomains);
952
953         /* TBD: there might be 64K domains,
954          * consider a different allocation scheme for future chips
955          */
956         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
957         if (!iommu->domain_ids) {
958                 printk(KERN_ERR "Allocating domain id array failed\n");
959                 return -ENOMEM;
960         }
961         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
962                         GFP_KERNEL);
963         if (!iommu->domains) {
964                 printk(KERN_ERR "Allocating domain array failed\n");
965                 kfree(iommu->domain_ids);
966                 return -ENOMEM;
967         }
968
969         spin_lock_init(&iommu->lock);
970
971         /*
972          * if Caching mode is set, then invalid translations are tagged
973          * with domain id 0, hence we need to pre-allocate it.
974          */
975         if (cap_caching_mode(iommu->cap))
976                 set_bit(0, iommu->domain_ids);
977         return 0;
978 }
979
980
981 static void domain_exit(struct dmar_domain *domain);
982
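/*
 * Tear down an IOMMU: destroy every domain still allocated on it, disable
 * translation, release its fault interrupt and free its domain bookkeeping
 * and context tables.
 */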
983 void free_dmar_iommu(struct intel_iommu *iommu)
984 {
985         struct dmar_domain *domain;
986         int i;
987
988         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
989         for (; i < cap_ndoms(iommu->cap); ) {
990                 domain = iommu->domains[i];
991                 clear_bit(i, iommu->domain_ids);
992                 domain_exit(domain);
993                 i = find_next_bit(iommu->domain_ids,
994                         cap_ndoms(iommu->cap), i+1);
995         }
996
997         if (iommu->gcmd & DMA_GCMD_TE)
998                 iommu_disable_translation(iommu);
999
1000         if (iommu->irq) {
1001                 set_irq_data(iommu->irq, NULL);
1002                 /* This will mask the irq */
1003                 free_irq(iommu->irq, iommu);
1004                 destroy_irq(iommu->irq);
1005         }
1006
1007         kfree(iommu->domains);
1008         kfree(iommu->domain_ids);
1009
1010         /* free context mapping */
1011         free_context_table(iommu);
1012 }
1013
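/*
 * Allocate a new dmar_domain on this IOMMU and assign it the first free
 * domain id from the IOMMU's domain-id bitmap.
 */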
1014 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1015 {
1016         unsigned long num;
1017         unsigned long ndomains;
1018         struct dmar_domain *domain;
1019         unsigned long flags;
1020
1021         domain = alloc_domain_mem();
1022         if (!domain)
1023                 return NULL;
1024
1025         ndomains = cap_ndoms(iommu->cap);
1026
1027         spin_lock_irqsave(&iommu->lock, flags);
1028         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1029         if (num >= ndomains) {
1030                 spin_unlock_irqrestore(&iommu->lock, flags);
1031                 free_domain_mem(domain);
1032                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1033                 return NULL;
1034         }
1035
1036         set_bit(num, iommu->domain_ids);
1037         domain->id = num;
1038         domain->iommu = iommu;
1039         iommu->domains[num] = domain;
1040         spin_unlock_irqrestore(&iommu->lock, flags);
1041
1042         return domain;
1043 }
1044
1045 static void iommu_free_domain(struct dmar_domain *domain)
1046 {
1047         unsigned long flags;
1048
1049         spin_lock_irqsave(&domain->iommu->lock, flags);
1050         clear_bit(domain->id, domain->iommu->domain_ids);
1051         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1052 }
1053
1054 static struct iova_domain reserved_iova_list;
1055 static struct lock_class_key reserved_alloc_key;
1056 static struct lock_class_key reserved_rbtree_key;
1057
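/*
 * Reserve IOVA ranges that must never be handed out for DMA: the IOAPIC
 * window and every PCI device's MMIO resources (to avoid generating
 * peer-to-peer accesses through the IOMMU).
 */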
1058 static void dmar_init_reserved_ranges(void)
1059 {
1060         struct pci_dev *pdev = NULL;
1061         struct iova *iova;
1062         int i;
1063         u64 addr, size;
1064
1065         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1066
1067         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1068                 &reserved_alloc_key);
1069         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1070                 &reserved_rbtree_key);
1071
1072         /* IOAPIC ranges shouldn't be accessed by DMA */
1073         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1074                 IOVA_PFN(IOAPIC_RANGE_END));
1075         if (!iova)
1076                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1077
1078         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1079         for_each_pci_dev(pdev) {
1080                 struct resource *r;
1081
1082                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1083                         r = &pdev->resource[i];
1084                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1085                                 continue;
1086                         addr = r->start;
1087                         addr &= PAGE_MASK_4K;
1088                         size = r->end - addr;
1089                         size = PAGE_ALIGN_4K(size);
1090                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1091                                 IOVA_PFN(size + addr) - 1);
1092                         if (!iova)
1093                                 printk(KERN_ERR "Reserve iova failed\n");
1094                 }
1095         }
1096
1097 }
1098
1099 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1100 {
1101         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1102 }
1103
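/*
 * Round a guest address width up to the next width the page-table format
 * can represent (12 bits plus a multiple of 9), capped at 64.
 */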
1104 static inline int guestwidth_to_adjustwidth(int gaw)
1105 {
1106         int agaw;
1107         int r = (gaw - 12) % 9;
1108
1109         if (r == 0)
1110                 agaw = gaw;
1111         else
1112                 agaw = gaw + 9 - r;
1113         if (agaw > 64)
1114                 agaw = 64;
1115         return agaw;
1116 }
1117
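/*
 * Initialize a freshly allocated domain: set up its IOVA allocator, pick an
 * adjusted guest address width the hardware supports, and allocate the
 * top-level page directory.
 */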
1118 static int domain_init(struct dmar_domain *domain, int guest_width)
1119 {
1120         struct intel_iommu *iommu;
1121         int adjust_width, agaw;
1122         unsigned long sagaw;
1123
1124         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1125         spin_lock_init(&domain->mapping_lock);
1126
1127         domain_reserve_special_ranges(domain);
1128
1129         /* calculate AGAW */
1130         iommu = domain->iommu;
1131         if (guest_width > cap_mgaw(iommu->cap))
1132                 guest_width = cap_mgaw(iommu->cap);
1133         domain->gaw = guest_width;
1134         adjust_width = guestwidth_to_adjustwidth(guest_width);
1135         agaw = width_to_agaw(adjust_width);
1136         sagaw = cap_sagaw(iommu->cap);
1137         if (!test_bit(agaw, &sagaw)) {
1138                 /* hardware doesn't support it, choose a bigger one */
1139                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1140                 agaw = find_next_bit(&sagaw, 5, agaw);
1141                 if (agaw >= 5)
1142                         return -ENODEV;
1143         }
1144         domain->agaw = agaw;
1145         INIT_LIST_HEAD(&domain->devices);
1146
1147         /* always allocate the top pgd */
1148         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1149         if (!domain->pgd)
1150                 return -ENOMEM;
1151         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1152         return 0;
1153 }
1154
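/*
 * Destroy a domain: detach its devices, release its IOVA allocator, clear
 * and free its page tables, and return its domain id to the IOMMU.
 */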
1155 static void domain_exit(struct dmar_domain *domain)
1156 {
1157         u64 end;
1158
1159         /* Domain 0 is reserved, so don't process it */
1160         if (!domain)
1161                 return;
1162
1163         domain_remove_dev_info(domain);
1164         /* destroy iovas */
1165         put_iova_domain(&domain->iovad);
1166         end = DOMAIN_MAX_ADDR(domain->gaw);
1167         end = end & (~PAGE_MASK_4K);
1168
1169         /* clear ptes */
1170         dma_pte_clear_range(domain, 0, end);
1171
1172         /* free page tables */
1173         dma_pte_free_pagetable(domain, 0, end);
1174
1175         iommu_free_domain(domain);
1176         free_domain_mem(domain);
1177 }
1178
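/*
 * Point the context entry for (bus, devfn) at @domain's page table and mark
 * it present, then flush the context cache and IOTLB as needed.
 */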
1179 static int domain_context_mapping_one(struct dmar_domain *domain,
1180                 u8 bus, u8 devfn)
1181 {
1182         struct context_entry *context;
1183         struct intel_iommu *iommu = domain->iommu;
1184         unsigned long flags;
1185
1186         pr_debug("Set context mapping for %02x:%02x.%d\n",
1187                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1188         BUG_ON(!domain->pgd);
1189         context = device_to_context_entry(iommu, bus, devfn);
1190         if (!context)
1191                 return -ENOMEM;
1192         spin_lock_irqsave(&iommu->lock, flags);
1193         if (context_present(*context)) {
1194                 spin_unlock_irqrestore(&iommu->lock, flags);
1195                 return 0;
1196         }
1197
1198         context_set_domain_id(*context, domain->id);
1199         context_set_address_width(*context, domain->agaw);
1200         context_set_address_root(*context, virt_to_phys(domain->pgd));
1201         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1202         context_set_fault_enable(*context);
1203         context_set_present(*context);
1204         __iommu_flush_cache(iommu, context, sizeof(*context));
1205
1206         /* it's a non-present to present mapping */
1207         if (iommu_flush_context_device(iommu, domain->id,
1208                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1209                 iommu_flush_write_buffer(iommu);
1210         else
1211                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1212         spin_unlock_irqrestore(&iommu->lock, flags);
1213         return 0;
1214 }
1215
1216 static int
1217 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1218 {
1219         int ret;
1220         struct pci_dev *tmp, *parent;
1221
1222         ret = domain_context_mapping_one(domain, pdev->bus->number,
1223                 pdev->devfn);
1224         if (ret)
1225                 return ret;
1226
1227         /* dependent device mapping */
1228         tmp = pci_find_upstream_pcie_bridge(pdev);
1229         if (!tmp)
1230                 return 0;
1231         /* Secondary interface's bus number and devfn 0 */
1232         parent = pdev->bus->self;
1233         while (parent != tmp) {
1234                 ret = domain_context_mapping_one(domain, parent->bus->number,
1235                         parent->devfn);
1236                 if (ret)
1237                         return ret;
1238                 parent = parent->bus->self;
1239         }
1240         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1241                 return domain_context_mapping_one(domain,
1242                         tmp->subordinate->number, 0);
1243         else /* this is a legacy PCI bridge */
1244                 return domain_context_mapping_one(domain,
1245                         tmp->bus->number, tmp->devfn);
1246 }
1247
1248 static int domain_context_mapped(struct dmar_domain *domain,
1249         struct pci_dev *pdev)
1250 {
1251         int ret;
1252         struct pci_dev *tmp, *parent;
1253
1254         ret = device_context_mapped(domain->iommu,
1255                 pdev->bus->number, pdev->devfn);
1256         if (!ret)
1257                 return ret;
1258         /* dependent device mapping */
1259         tmp = pci_find_upstream_pcie_bridge(pdev);
1260         if (!tmp)
1261                 return ret;
1262         /* Secondary interface's bus number and devfn 0 */
1263         parent = pdev->bus->self;
1264         while (parent != tmp) {
1265                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1266                         parent->devfn);
1267                 if (!ret)
1268                         return ret;
1269                 parent = parent->bus->self;
1270         }
1271         if (tmp->is_pcie)
1272                 return device_context_mapped(domain->iommu,
1273                         tmp->subordinate->number, 0);
1274         else
1275                 return device_context_mapped(domain->iommu,
1276                         tmp->bus->number, tmp->devfn);
1277 }
1278
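/*
 * Map the physical range [hpa, hpa + size) at @iova in @domain, one 4K page
 * at a time, with the requested read/write protection bits.
 */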
1279 static int
1280 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1281                         u64 hpa, size_t size, int prot)
1282 {
1283         u64 start_pfn, end_pfn;
1284         struct dma_pte *pte;
1285         int index;
1286
1287         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1288                 return -EINVAL;
1289         iova &= PAGE_MASK_4K;
1290         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1291         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1292         index = 0;
1293         while (start_pfn < end_pfn) {
1294                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1295                 if (!pte)
1296                         return -ENOMEM;
1297                 /* No lock needed here; nobody else
1298                  * touches this iova range
1299                  */
1300                 BUG_ON(dma_pte_addr(*pte));
1301                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1302                 dma_set_pte_prot(*pte, prot);
1303                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1304                 start_pfn++;
1305                 index++;
1306         }
1307         return 0;
1308 }
1309
1310 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1311 {
1312         clear_context_table(domain->iommu, bus, devfn);
1313         iommu_flush_context_global(domain->iommu, 0);
1314         iommu_flush_iotlb_global(domain->iommu, 0);
1315 }
1316
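/*
 * Unbind every device from @domain: drop its device_domain_info entries and
 * clear the corresponding context table entries.
 */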
1317 static void domain_remove_dev_info(struct dmar_domain *domain)
1318 {
1319         struct device_domain_info *info;
1320         unsigned long flags;
1321
1322         spin_lock_irqsave(&device_domain_lock, flags);
1323         while (!list_empty(&domain->devices)) {
1324                 info = list_entry(domain->devices.next,
1325                         struct device_domain_info, link);
1326                 list_del(&info->link);
1327                 list_del(&info->global);
1328                 if (info->dev)
1329                         info->dev->dev.archdata.iommu = NULL;
1330                 spin_unlock_irqrestore(&device_domain_lock, flags);
1331
1332                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1333                 free_devinfo_mem(info);
1334
1335                 spin_lock_irqsave(&device_domain_lock, flags);
1336         }
1337         spin_unlock_irqrestore(&device_domain_lock, flags);
1338 }
1339
1340 /*
1341  * find_domain
1342  * Note: struct pci_dev->dev.archdata.iommu stores the device_domain_info
1343  */
1344 struct dmar_domain *
1345 find_domain(struct pci_dev *pdev)
1346 {
1347         struct device_domain_info *info;
1348
1349         /* No lock here, assumes no domain exit in normal case */
1350         info = pdev->dev.archdata.iommu;
1351         if (info)
1352                 return info->domain;
1353         return NULL;
1354 }
1355
1356 /* Find or allocate the domain for a device; the returned domain is initialized. */
1357 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1358 {
1359         struct dmar_domain *domain, *found = NULL;
1360         struct intel_iommu *iommu;
1361         struct dmar_drhd_unit *drhd;
1362         struct device_domain_info *info, *tmp;
1363         struct pci_dev *dev_tmp;
1364         unsigned long flags;
1365         int bus = 0, devfn = 0;
1366
1367         domain = find_domain(pdev);
1368         if (domain)
1369                 return domain;
1370
1371         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1372         if (dev_tmp) {
1373                 if (dev_tmp->is_pcie) {
1374                         bus = dev_tmp->subordinate->number;
1375                         devfn = 0;
1376                 } else {
1377                         bus = dev_tmp->bus->number;
1378                         devfn = dev_tmp->devfn;
1379                 }
1380                 spin_lock_irqsave(&device_domain_lock, flags);
1381                 list_for_each_entry(info, &device_domain_list, global) {
1382                         if (info->bus == bus && info->devfn == devfn) {
1383                                 found = info->domain;
1384                                 break;
1385                         }
1386                 }
1387                 spin_unlock_irqrestore(&device_domain_lock, flags);
1388                 /* the pcie-pci bridge already has a domain, use it */
1389                 if (found) {
1390                         domain = found;
1391                         goto found_domain;
1392                 }
1393         }
1394
1395         /* Allocate new domain for the device */
1396         drhd = dmar_find_matched_drhd_unit(pdev);
1397         if (!drhd) {
1398                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1399                         pci_name(pdev));
1400                 return NULL;
1401         }
1402         iommu = drhd->iommu;
1403
1404         domain = iommu_alloc_domain(iommu);
1405         if (!domain)
1406                 goto error;
1407
1408         if (domain_init(domain, gaw)) {
1409                 domain_exit(domain);
1410                 goto error;
1411         }
1412
1413         /* register pcie-to-pci device */
1414         if (dev_tmp) {
1415                 info = alloc_devinfo_mem();
1416                 if (!info) {
1417                         domain_exit(domain);
1418                         goto error;
1419                 }
1420                 info->bus = bus;
1421                 info->devfn = devfn;
1422                 info->dev = NULL;
1423                 info->domain = domain;
1424                 /* This domain is shared by devices under p2p bridge */
1425                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1426
1427                 /* the pcie-to-pci bridge already has a domain, use it */
1428                 found = NULL;
1429                 spin_lock_irqsave(&device_domain_lock, flags);
1430                 list_for_each_entry(tmp, &device_domain_list, global) {
1431                         if (tmp->bus == bus && tmp->devfn == devfn) {
1432                                 found = tmp->domain;
1433                                 break;
1434                         }
1435                 }
1436                 if (found) {
1437                         free_devinfo_mem(info);
1438                         domain_exit(domain);
1439                         domain = found;
1440                 } else {
1441                         list_add(&info->link, &domain->devices);
1442                         list_add(&info->global, &device_domain_list);
1443                 }
1444                 spin_unlock_irqrestore(&device_domain_lock, flags);
1445         }
1446
1447 found_domain:
1448         info = alloc_devinfo_mem();
1449         if (!info)
1450                 goto error;
1451         info->bus = pdev->bus->number;
1452         info->devfn = pdev->devfn;
1453         info->dev = pdev;
1454         info->domain = domain;
1455         spin_lock_irqsave(&device_domain_lock, flags);
1456         /* somebody else raced us and set the domain already */
1457         found = find_domain(pdev);
1458         if (found != NULL) {
1459                 spin_unlock_irqrestore(&device_domain_lock, flags);
1460                 if (found != domain) {
1461                         domain_exit(domain);
1462                         domain = found;
1463                 }
1464                 free_devinfo_mem(info);
1465                 return domain;
1466         }
1467         list_add(&info->link, &domain->devices);
1468         list_add(&info->global, &device_domain_list);
1469         pdev->dev.archdata.iommu = info;
1470         spin_unlock_irqrestore(&device_domain_lock, flags);
1471         return domain;
1472 error:
1473         /* recheck here; another path may have set the domain */
1474         return find_domain(pdev);
1475 }
1476
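/*
 * Set up a 1:1 (identity) mapping of [start, end) for @pdev, reserving the
 * corresponding IOVA range so it is never allocated for regular DMA.
 */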
1477 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1478 {
1479         struct dmar_domain *domain;
1480         unsigned long size;
1481         u64 base;
1482         int ret;
1483
1484         printk(KERN_INFO
1485                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1486                 pci_name(pdev), start, end);
1487         /* page table init */
1488         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1489         if (!domain)
1490                 return -ENOMEM;
1491
1492         /* The address might not be aligned */
1493         base = start & PAGE_MASK_4K;
1494         size = end - base;
1495         size = PAGE_ALIGN_4K(size);
1496         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1497                         IOVA_PFN(base + size) - 1)) {
1498                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1499                 ret = -ENOMEM;
1500                 goto error;
1501         }
1502
1503         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1504                 size, base, pci_name(pdev));
1505         /*
1506          * The RMRR range might overlap a physical memory range,
1507          * so clear it first
1508          */
1509         dma_pte_clear_range(domain, base, base + size);
1510
1511         ret = domain_page_mapping(domain, base, base, size,
1512                 DMA_PTE_READ|DMA_PTE_WRITE);
1513         if (ret)
1514                 goto error;
1515
1516         /* context entry init */
1517         ret = domain_context_mapping(domain, pdev);
1518         if (!ret)
1519                 return 0;
1520 error:
1521         domain_exit(domain);
1522         return ret;
1523
1524 }
1525
1526 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1527         struct pci_dev *pdev)
1528 {
1529         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1530                 return 0;
1531         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1532                 rmrr->end_address + 1);
1533 }
1534
1535 #ifdef CONFIG_DMAR_GFX_WA
1536 struct iommu_prepare_data {
1537         struct pci_dev *pdev;
1538         int ret;
1539 };
1540
1541 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1542                                          unsigned long end_pfn, void *datax)
1543 {
1544         struct iommu_prepare_data *data;
1545
1546         data = (struct iommu_prepare_data *)datax;
1547
1548         data->ret = iommu_prepare_identity_map(data->pdev,
1549                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1550         return data->ret;
1551
1552 }
1553
1554 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1555 {
1556         int nid;
1557         struct iommu_prepare_data data;
1558
1559         data.pdev = pdev;
1560         data.ret = 0;
1561
1562         for_each_online_node(nid) {
1563                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1564                 if (data.ret)
1565                         return data.ret;
1566         }
1567         return data.ret;
1568 }
1569
1570 static void __init iommu_prepare_gfx_mapping(void)
1571 {
1572         struct pci_dev *pdev = NULL;
1573         int ret;
1574
1575         for_each_pci_dev(pdev) {
1576                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1577                                 !IS_GFX_DEVICE(pdev))
1578                         continue;
1579                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1580                         pci_name(pdev));
1581                 ret = iommu_prepare_with_active_regions(pdev);
1582                 if (ret)
1583                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1584         }
1585 }
1586 #endif
1587
1588 #ifdef CONFIG_DMAR_FLOPPY_WA
1589 static inline void iommu_prepare_isa(void)
1590 {
1591         struct pci_dev *pdev;
1592         int ret;
1593
1594         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1595         if (!pdev)
1596                 return;
1597
1598         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1599         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1600
1601         if (ret)
1602                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1603                         "floppy might not work\n");
1604
1605 }
1606 #else
1607 static inline void iommu_prepare_isa(void)
1608 {
1609         return;
1610 }
1611 #endif /* CONFIG_DMAR_FLOPPY_WA */
1612
1613 int __init init_dmars(void)
1614 {
1615         struct dmar_drhd_unit *drhd;
1616         struct dmar_rmrr_unit *rmrr;
1617         struct pci_dev *pdev;
1618         struct intel_iommu *iommu;
1619         int i, ret, unit = 0;
1620
1621         /*
1622          * for each drhd
1623          *    allocate root
1624          *    initialize and program root entry to not present
1625          * endfor
1626          */
1627         for_each_drhd_unit(drhd) {
1628                 g_num_of_iommus++;
1629                 /*
1630                  * No lock is needed: the count is only incremented in the
1631                  * single-threaded kernel __init code path; all other
1632                  * accesses are read-only.
1633                  */
1634         }
1635
1636         deferred_flush = kzalloc(g_num_of_iommus *
1637                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
1638         if (!deferred_flush) {
1639                 ret = -ENOMEM;
1640                 goto error;
1641         }
1642
1643         for_each_drhd_unit(drhd) {
1644                 if (drhd->ignored)
1645                         continue;
1646
1647                 iommu = drhd->iommu;
1648
1649                 ret = iommu_init_domains(iommu);
1650                 if (ret)
1651                         goto error;
1652
1653                 /*
1654                  * TBD:
1655                  * we could share the same root & context tables
1656                  * among all IOMMUs; need to split this out later.
1657                  */
1658                 ret = iommu_alloc_root_entry(iommu);
1659                 if (ret) {
1660                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1661                         goto error;
1662                 }
1663         }
1664
1665         /*
1666          * For each rmrr
1667          *   for each dev attached to rmrr
1668          *   do
1669          *     locate drhd for dev, alloc domain for dev
1670          *     allocate free domain
1671          *     allocate page table entries for rmrr
1672          *     if context not allocated for bus
1673          *           allocate and init context
1674          *           set present in root table for this bus
1675          *     init context with domain, translation etc
1676          *    endfor
1677          * endfor
1678          */
1679         for_each_rmrr_units(rmrr) {
1680                 for (i = 0; i < rmrr->devices_cnt; i++) {
1681                         pdev = rmrr->devices[i];
1682                         /* some BIOSes list non-existent devices in the DMAR table */
1683                         if (!pdev)
1684                                 continue;
1685                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1686                         if (ret)
1687                                 printk(KERN_ERR
1688                                  "IOMMU: mapping reserved region failed\n");
1689                 }
1690         }
1691
1692         iommu_prepare_gfx_mapping();
1693
1694         iommu_prepare_isa();
1695
1696         /*
1697          * for each drhd
1698          *   enable fault log
1699          *   global invalidate context cache
1700          *   global invalidate iotlb
1701          *   enable translation
1702          */
1703         for_each_drhd_unit(drhd) {
1704                 if (drhd->ignored)
1705                         continue;
1706                 iommu = drhd->iommu;
1707                 sprintf(iommu->name, "dmar%d", unit++);
1708
1709                 iommu_flush_write_buffer(iommu);
1710
1711                 ret = dmar_set_interrupt(iommu);
1712                 if (ret)
1713                         goto error;
1714
1715                 iommu_set_root_entry(iommu);
1716
1717                 iommu_flush_context_global(iommu, 0);
1718                 iommu_flush_iotlb_global(iommu, 0);
1719
1720                 iommu_disable_protect_mem_regions(iommu);
1721
1722                 ret = iommu_enable_translation(iommu);
1723                 if (ret)
1724                         goto error;
1725         }
1726
1727         return 0;
1728 error:
1729         for_each_drhd_unit(drhd) {
1730                 if (drhd->ignored)
1731                         continue;
1732                 iommu = drhd->iommu;
1733                 free_iommu(iommu);
1734         }
1735         return ret;
1736 }
1737
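     /*
      * Round a transfer up to whole 4K pages, counting the offset of
      * host_addr inside its first page; e.g. host_addr 0x1ffc with size 8
      * crosses a page boundary and yields 0x2000 (two 4K pages).
      */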
1738 static inline u64 aligned_size(u64 host_addr, size_t size)
1739 {
1740         u64 addr;
1741         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1742         return PAGE_ALIGN_4K(addr);
1743 }
1744
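     /*
      * Allocate an IOVA range of @size bytes from the domain's allocator,
      * no higher than @end (clamped to the domain's guest address width);
      * returns NULL if the request is empty or cannot fit.
      */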
1745 struct iova *
1746 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1747 {
1748         struct iova *piova;
1749
1750         /* Make sure it's in range */
1751         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1752         if (!size || (IOVA_START_ADDR + size > end))
1753                 return NULL;
1754
1755         piova = alloc_iova(&domain->iovad,
1756                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1757         return piova;
1758 }
1759
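     /*
      * If the device cannot address more than 32 bits, or forcedac is set,
      * allocate anywhere below its dma_mask; otherwise prefer an IOVA below
      * 4GB and only fall back to the full dma_mask range.
      */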
1760 static struct iova *
1761 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1762                 size_t size)
1763 {
1764         struct pci_dev *pdev = to_pci_dev(dev);
1765         struct iova *iova = NULL;
1766
1767         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1768                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1769         } else  {
1770                 /*
1771                  * First try to allocate an io virtual address in
1772                  * DMA_32BIT_MASK and if that fails then try allocating
1773                  * from higher range
1774                  */
1775                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1776                 if (!iova)
1777                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1778         }
1779
1780         if (!iova) {
1781                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
1782                 return NULL;
1783         }
1784
1785         return iova;
1786 }
1787
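     /*
      * Return the DMA-remapping domain for @pdev, allocating one and
      * establishing the context-table mapping if necessary; NULL on failure.
      */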
1788 static struct dmar_domain *
1789 get_valid_domain_for_dev(struct pci_dev *pdev)
1790 {
1791         struct dmar_domain *domain;
1792         int ret;
1793
1794         domain = get_domain_for_dev(pdev,
1795                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1796         if (!domain) {
1797                 printk(KERN_ERR
1798                         "Allocating domain for %s failed\n", pci_name(pdev));
1799                 return NULL;
1800         }
1801
1802         /* make sure context mapping is ok */
1803         if (unlikely(!domain_context_mapped(domain, pdev))) {
1804                 ret = domain_context_mapping(domain, pdev);
1805                 if (ret) {
1806                         printk(KERN_ERR
1807                                 "Domain context map for %s failed\n",
1808                                 pci_name(pdev));
1809                         return NULL;
1810                 }
1811         }
1812
1813         return domain;
1814 }
1815
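     /*
      * ->map_single hook: allocate an IOVA covering paddr..paddr+size, map
      * it with the proper read/write protection, flush the IOTLB (or the
      * write buffer) and return the bus address, or 0 on failure.
      * Pass-through devices marked DUMMY_DEVICE_DOMAIN_INFO simply get the
      * physical address back.
      */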
1816 static dma_addr_t
1817 intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir)
1818 {
1819         struct pci_dev *pdev = to_pci_dev(hwdev);
1820         struct dmar_domain *domain;
1821         unsigned long start_paddr;
1822         struct iova *iova;
1823         int prot = 0;
1824         int ret;
1825
1826         BUG_ON(dir == DMA_NONE);
1827         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1828                 return paddr;
1829
1830         domain = get_valid_domain_for_dev(pdev);
1831         if (!domain)
1832                 return 0;
1833
1834         size = aligned_size((u64)paddr, size);
1835
1836         iova = __intel_alloc_iova(hwdev, domain, size);
1837         if (!iova)
1838                 goto error;
1839
1840         start_paddr = iova->pfn_lo << PAGE_SHIFT_4K;
1841
1842         /*
1843          * Check whether the DMAR engine supports zero-length reads on
1844          * write-only mappings.
1845          */
1846         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
1847                         !cap_zlr(domain->iommu->cap))
1848                 prot |= DMA_PTE_READ;
1849         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1850                 prot |= DMA_PTE_WRITE;
1851         /*
1852          * paddr .. (paddr + size) might cover only part of a page; we must
1853          * map the whole page.  Note: if two parts of one page are mapped
1854          * separately, two guest addresses may end up mapping to the same
1855          * host paddr, but this is not a big problem.
1856          */
1857         ret = domain_page_mapping(domain, start_paddr,
1858                 ((u64)paddr) & PAGE_MASK_4K, size, prot);
1859         if (ret)
1860                 goto error;
1861
1862         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1863                 pci_name(pdev), size, (u64)paddr,
1864                 size, (u64)start_paddr, dir);
1865
1866         /* it's a non-present to present mapping */
1867         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1868                         start_paddr, size >> PAGE_SHIFT_4K, 1);
1869         if (ret)
1870                 iommu_flush_write_buffer(domain->iommu);
1871
1872         return (start_paddr + ((u64)paddr & (~PAGE_MASK_4K)));
1873
1874 error:
1875         if (iova)
1876                 __free_iova(&domain->iovad, iova);
1877         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1878                 pci_name(pdev), size, (u64)paddr, dir);
1879         return 0;
1880 }
1881
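     /*
      * Called with async_umap_flush_lock held: do one global IOTLB flush
      * per IOMMU that has deferred entries, then free the queued IOVAs.
      */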
1882 static void flush_unmaps(void)
1883 {
1884         int i, j;
1885
1886         timer_on = 0;
1887
1888         /* just flush them all */
1889         for (i = 0; i < g_num_of_iommus; i++) {
1890                 if (deferred_flush[i].next) {
1891                         struct intel_iommu *iommu =
1892                                 deferred_flush[i].domain[0]->iommu;
1893
1894                         iommu_flush_iotlb_global(iommu, 0);
1895                         for (j = 0; j < deferred_flush[i].next; j++) {
1896                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
1897                                                 deferred_flush[i].iova[j]);
1898                         }
1899                         deferred_flush[i].next = 0;
1900                 }
1901         }
1902
1903         list_size = 0;
1904 }
1905
1906 static void flush_unmaps_timeout(unsigned long data)
1907 {
1908         unsigned long flags;
1909
1910         spin_lock_irqsave(&async_umap_flush_lock, flags);
1911         flush_unmaps();
1912         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1913 }
1914
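     /*
      * Queue an IOVA for deferred freeing on its IOMMU's per-unit table;
      * flush immediately once HIGH_WATER_MARK entries are pending and arm
      * the 10ms unmap_timer if it is not already running.
      */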
1915 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
1916 {
1917         unsigned long flags;
1918         int next, iommu_id;
1919
1920         spin_lock_irqsave(&async_umap_flush_lock, flags);
1921         if (list_size == HIGH_WATER_MARK)
1922                 flush_unmaps();
1923
1924         iommu_id = dom->iommu->seq_id;
1925
1926         next = deferred_flush[iommu_id].next;
1927         deferred_flush[iommu_id].domain[next] = dom;
1928         deferred_flush[iommu_id].iova[next] = iova;
1929         deferred_flush[iommu_id].next++;
1930
1931         if (!timer_on) {
1932                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
1933                 timer_on = 1;
1934         }
1935         list_size++;
1936         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
1937 }
1938
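     /*
      * ->unmap_single hook: look up the IOVA behind dev_addr, clear its
      * PTEs and free the page tables, then either flush the IOTLB
      * synchronously (intel_iommu_strict) or defer the IOVA release.
      */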
1939 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1940         size_t size, int dir)
1941 {
1942         struct pci_dev *pdev = to_pci_dev(dev);
1943         struct dmar_domain *domain;
1944         unsigned long start_addr;
1945         struct iova *iova;
1946
1947         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1948                 return;
1949         domain = find_domain(pdev);
1950         BUG_ON(!domain);
1951
1952         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1953         if (!iova)
1954                 return;
1955
1956         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1957         size = aligned_size((u64)dev_addr, size);
1958
1959         pr_debug("Device %s unmapping: %lx@%llx\n",
1960                 pci_name(pdev), size, (u64)start_addr);
1961
1962         /*  clear the whole page */
1963         dma_pte_clear_range(domain, start_addr, start_addr + size);
1964         /* free page tables */
1965         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1966         if (intel_iommu_strict) {
1967                 if (iommu_flush_iotlb_psi(domain->iommu,
1968                         domain->id, start_addr, size >> PAGE_SHIFT_4K, 0))
1969                         iommu_flush_write_buffer(domain->iommu);
1970                 /* free iova */
1971                 __free_iova(&domain->iovad, iova);
1972         } else {
1973                 add_unmap(domain, iova);
1974                 /*
1975                  * Queue up the release of the unmap to save roughly 1/6th of
1976                  * the CPU time otherwise spent on the iotlb flush operation.
1977                  */
1978         }
1979 }
1980
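     /*
      * ->alloc_coherent hook: allocate zeroed pages and map them
      * bidirectionally through intel_map_single().
      */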
1981 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1982                        dma_addr_t *dma_handle, gfp_t flags)
1983 {
1984         void *vaddr;
1985         int order;
1986
1987         size = PAGE_ALIGN_4K(size);
1988         order = get_order(size);
1989         flags &= ~(GFP_DMA | GFP_DMA32);
1990
1991         vaddr = (void *)__get_free_pages(flags, order);
1992         if (!vaddr)
1993                 return NULL;
1994         memset(vaddr, 0, size);
1995
1996         *dma_handle = intel_map_single(hwdev, virt_to_bus(vaddr), size, DMA_BIDIRECTIONAL);
1997         if (*dma_handle)
1998                 return vaddr;
1999         free_pages((unsigned long)vaddr, order);
2000         return NULL;
2001 }
2002
2003 static void intel_free_coherent(struct device *hwdev, size_t size,
2004         void *vaddr, dma_addr_t dma_handle)
2005 {
2006         int order;
2007
2008         size = PAGE_ALIGN_4K(size);
2009         order = get_order(size);
2010
2011         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2012         free_pages((unsigned long)vaddr, order);
2013 }
2014
2015 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2016 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2017         int nelems, int dir)
2018 {
2019         int i;
2020         struct pci_dev *pdev = to_pci_dev(hwdev);
2021         struct dmar_domain *domain;
2022         unsigned long start_addr;
2023         struct iova *iova;
2024         size_t size = 0;
2025         void *addr;
2026         struct scatterlist *sg;
2027
2028         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2029                 return;
2030
2031         domain = find_domain(pdev);
2032
2033         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2034         if (!iova)
2035                 return;
2036         for_each_sg(sglist, sg, nelems, i) {
2037                 addr = SG_ENT_VIRT_ADDRESS(sg);
2038                 size += aligned_size((u64)addr, sg->length);
2039         }
2040
2041         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2042
2043         /*  clear the whole page */
2044         dma_pte_clear_range(domain, start_addr, start_addr + size);
2045         /* free page tables */
2046         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2047
2048         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2049                         size >> PAGE_SHIFT_4K, 0))
2050                 iommu_flush_write_buffer(domain->iommu);
2051
2052         /* free iova */
2053         __free_iova(&domain->iovad, iova);
2054 }
2055
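     /*
      * Scatterlist mapping for pass-through devices: no remapping, just
      * fill in bus addresses with virt_to_bus().
      */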
2056 static int intel_nontranslate_map_sg(struct device *hwdev,
2057         struct scatterlist *sglist, int nelems, int dir)
2058 {
2059         int i;
2060         struct scatterlist *sg;
2061
2062         for_each_sg(sglist, sg, nelems, i) {
2063                 BUG_ON(!sg_page(sg));
2064                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2065                 sg->dma_length = sg->length;
2066         }
2067         return nelems;
2068 }
2069
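     /*
      * ->map_sg hook: allocate one contiguous IOVA range big enough for all
      * segments, map each segment into it back to back, and undo the
      * partial mapping (PTEs, page tables, IOVA) on failure.
      */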
2070 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2071                                 int nelems, int dir)
2072 {
2073         void *addr;
2074         int i;
2075         struct pci_dev *pdev = to_pci_dev(hwdev);
2076         struct dmar_domain *domain;
2077         size_t size = 0;
2078         int prot = 0;
2079         size_t offset = 0;
2080         struct iova *iova = NULL;
2081         int ret;
2082         struct scatterlist *sg;
2083         unsigned long start_addr;
2084
2085         BUG_ON(dir == DMA_NONE);
2086         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2087                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2088
2089         domain = get_valid_domain_for_dev(pdev);
2090         if (!domain)
2091                 return 0;
2092
2093         for_each_sg(sglist, sg, nelems, i) {
2094                 addr = SG_ENT_VIRT_ADDRESS(sg);
2095                 addr = (void *)virt_to_phys(addr);
2096                 size += aligned_size((u64)addr, sg->length);
2097         }
2098
2099         iova = __intel_alloc_iova(hwdev, domain, size);
2100         if (!iova) {
2101                 sglist->dma_length = 0;
2102                 return 0;
2103         }
2104
2105         /*
2106          * Check whether the DMAR engine supports zero-length reads on
2107          * write-only mappings.
2108          */
2109         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2110                         !cap_zlr(domain->iommu->cap))
2111                 prot |= DMA_PTE_READ;
2112         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2113                 prot |= DMA_PTE_WRITE;
2114
2115         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2116         offset = 0;
2117         for_each_sg(sglist, sg, nelems, i) {
2118                 addr = SG_ENT_VIRT_ADDRESS(sg);
2119                 addr = (void *)virt_to_phys(addr);
2120                 size = aligned_size((u64)addr, sg->length);
2121                 ret = domain_page_mapping(domain, start_addr + offset,
2122                         ((u64)addr) & PAGE_MASK_4K,
2123                         size, prot);
2124                 if (ret) {
2125                         /*  clear the page */
2126                         dma_pte_clear_range(domain, start_addr,
2127                                   start_addr + offset);
2128                         /* free page tables */
2129                         dma_pte_free_pagetable(domain, start_addr,
2130                                   start_addr + offset);
2131                         /* free iova */
2132                         __free_iova(&domain->iovad, iova);
2133                         return 0;
2134                 }
2135                 sg->dma_address = start_addr + offset +
2136                                 ((u64)addr & (~PAGE_MASK_4K));
2137                 sg->dma_length = sg->length;
2138                 offset += size;
2139         }
2140
2141         /* it's a non-present to present mapping */
2142         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2143                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2144                 iommu_flush_write_buffer(domain->iommu);
2145         return nelems;
2146 }
2147
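     /*
      * DMA operations installed as the global dma_ops by intel_iommu_init()
      * below, so the generic DMA mapping calls are routed through the
      * IOMMU.  For example (hypothetical driver code, not part of this
      * file), dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE) would end
      * up in intel_map_single() above.
      */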
2148 static struct dma_mapping_ops intel_dma_ops = {
2149         .alloc_coherent = intel_alloc_coherent,
2150         .free_coherent = intel_free_coherent,
2151         .map_single = intel_map_single,
2152         .unmap_single = intel_unmap_single,
2153         .map_sg = intel_map_sg,
2154         .unmap_sg = intel_unmap_sg,
2155 };
2156
2157 static inline int iommu_domain_cache_init(void)
2158 {
2159         int ret = 0;
2160
2161         iommu_domain_cache = kmem_cache_create("iommu_domain",
2162                                          sizeof(struct dmar_domain),
2163                                          0,
2164                                          SLAB_HWCACHE_ALIGN,
2165                                          NULL);
2167         if (!iommu_domain_cache) {
2168                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2169                 ret = -ENOMEM;
2170         }
2171
2172         return ret;
2173 }
2174
2175 static inline int iommu_devinfo_cache_init(void)
2176 {
2177         int ret = 0;
2178
2179         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2180                                          sizeof(struct device_domain_info),
2181                                          0,
2182                                          SLAB_HWCACHE_ALIGN,
2183                                          NULL);
2185         if (!iommu_devinfo_cache) {
2186                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2187                 ret = -ENOMEM;
2188         }
2189
2190         return ret;
2191 }
2192
2193 static inline int iommu_iova_cache_init(void)
2194 {
2195         int ret = 0;
2196
2197         iommu_iova_cache = kmem_cache_create("iommu_iova",
2198                                          sizeof(struct iova),
2199                                          0,
2200                                          SLAB_HWCACHE_ALIGN,
2201                                          NULL);
2203         if (!iommu_iova_cache) {
2204                 printk(KERN_ERR "Couldn't create iova cache\n");
2205                 ret = -ENOMEM;
2206         }
2207
2208         return ret;
2209 }
2210
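     /*
      * Create the iova, domain and devinfo slab caches; on failure tear
      * down whatever was already created.
      */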
2211 static int __init iommu_init_mempool(void)
2212 {
2213         int ret;
2214         ret = iommu_iova_cache_init();
2215         if (ret)
2216                 return ret;
2217
2218         ret = iommu_domain_cache_init();
2219         if (ret)
2220                 goto domain_error;
2221
2222         ret = iommu_devinfo_cache_init();
2223         if (!ret)
2224                 return ret;
2225
2226         kmem_cache_destroy(iommu_domain_cache);
2227 domain_error:
2228         kmem_cache_destroy(iommu_iova_cache);
2229
2230         return -ENOMEM;
2231 }
2232
2233 static void __init iommu_exit_mempool(void)
2234 {
2235         kmem_cache_destroy(iommu_devinfo_cache);
2236         kmem_cache_destroy(iommu_domain_cache);
2237         kmem_cache_destroy(iommu_iova_cache);
2238
2239 }
2240
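     /*
      * Mark DRHD units that cover no present PCI devices as ignored; when
      * gfx mapping is disabled (dmar_map_gfx == 0), also ignore units that
      * cover only graphics devices and flag their devices as pass-through
      * (DUMMY_DEVICE_DOMAIN_INFO).
      */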
2241 static void __init init_no_remapping_devices(void)
2242 {
2243         struct dmar_drhd_unit *drhd;
2244
2245         for_each_drhd_unit(drhd) {
2246                 if (!drhd->include_all) {
2247                         int i;
2248                         for (i = 0; i < drhd->devices_cnt; i++)
2249                                 if (drhd->devices[i] != NULL)
2250                                         break;
2251                         /* ignore DMAR unit if no pci devices exist */
2252                         if (i == drhd->devices_cnt)
2253                                 drhd->ignored = 1;
2254                 }
2255         }
2256
2257         if (dmar_map_gfx)
2258                 return;
2259
2260         for_each_drhd_unit(drhd) {
2261                 int i;
2262                 if (drhd->ignored || drhd->include_all)
2263                         continue;
2264
2265                 for (i = 0; i < drhd->devices_cnt; i++)
2266                         if (drhd->devices[i] &&
2267                                 !IS_GFX_DEVICE(drhd->devices[i]))
2268                                 break;
2269
2270                 if (i < drhd->devices_cnt)
2271                         continue;
2272
2273                 /* bypass IOMMU if it is just for gfx devices */
2274                 drhd->ignored = 1;
2275                 for (i = 0; i < drhd->devices_cnt; i++) {
2276                         if (!drhd->devices[i])
2277                                 continue;
2278                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2279                 }
2280         }
2281 }
2282
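     /*
      * Driver entry point: parse the DMAR table and device scopes, bail out
      * if translation is not wanted (no_iommu, swiotlb or dmar_disabled),
      * set up the mempools and reserved IOVA ranges, initialize every DMAR
      * unit via init_dmars() and finally install intel_dma_ops.
      */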
2283 int __init intel_iommu_init(void)
2284 {
2285         int ret = 0;
2286
2287         if (dmar_table_init())
2288                 return  -ENODEV;
2289
2290         if (dmar_dev_scope_init())
2291                 return  -ENODEV;
2292
2293         /*
2294          * Check now whether DMA-remapping initialization is needed;
2295          * the initialization above is also used by interrupt remapping.
2296          */
2297         if (no_iommu || swiotlb || dmar_disabled)
2298                 return -ENODEV;
2299
2300         iommu_init_mempool();
2301         dmar_init_reserved_ranges();
2302
2303         init_no_remapping_devices();
2304
2305         ret = init_dmars();
2306         if (ret) {
2307                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2308                 put_iova_domain(&reserved_iova_list);
2309                 iommu_exit_mempool();
2310                 return ret;
2311         }
2312         printk(KERN_INFO
2313         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2314
2315         init_timer(&unmap_timer);
2316         force_iommu = 1;
2317         dma_ops = &intel_dma_ops;
2318         return 0;
2319 }
2320