Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/sparc-2.6
[linux-2.6] / drivers / pci / intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  */
22
23 #include <linux/init.h>
24 #include <linux/bitmap.h>
25 #include <linux/slab.h>
26 #include <linux/irq.h>
27 #include <linux/interrupt.h>
28 #include <linux/sysdev.h>
29 #include <linux/spinlock.h>
30 #include <linux/pci.h>
31 #include <linux/dmar.h>
32 #include <linux/dma-mapping.h>
33 #include <linux/mempool.h>
34 #include "iova.h"
35 #include "intel-iommu.h"
36 #include <asm/proto.h> /* force_iommu in this header in x86-64*/
37 #include <asm/cacheflush.h>
38 #include <asm/gart.h>
39 #include "pci.h"
40
/*
 * PCI class helpers. The macro argument is parenthesized so that any
 * pointer expression (e.g. "&dev") expands correctly.
 */
/* True when the upper byte of the PCI class code is the display base class. */
#define IS_GFX_DEVICE(pdev) (((pdev)->class >> 16) == PCI_BASE_CLASS_DISPLAY)
/* True when the PCI class/subclass pair is the ISA bridge class. */
#define IS_ISA_DEVICE(pdev) (((pdev)->class >> 8) == PCI_CLASS_BRIDGE_ISA)

#define IOAPIC_RANGE_START	(0xfee00000)
#define IOAPIC_RANGE_END	(0xfeefffff)
#define IOVA_START_ADDR		(0x1000)

#define DEFAULT_DOMAIN_ADDRESS_WIDTH 48

#define DMAR_OPERATION_TIMEOUT (HZ*60) /* 1m */

/* Highest address representable with a guest address width of gaw bits. */
#define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << (gaw)) - 1)
53
54 static void domain_remove_dev_info(struct dmar_domain *domain);
55
56 static int dmar_disabled;
57 static int __initdata dmar_map_gfx = 1;
58 static int dmar_forcedac;
59
60 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
61 static DEFINE_SPINLOCK(device_domain_lock);
62 static LIST_HEAD(device_domain_list);
63
64 static int __init intel_iommu_setup(char *str)
65 {
66         if (!str)
67                 return -EINVAL;
68         while (*str) {
69                 if (!strncmp(str, "off", 3)) {
70                         dmar_disabled = 1;
71                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
72                 } else if (!strncmp(str, "igfx_off", 8)) {
73                         dmar_map_gfx = 0;
74                         printk(KERN_INFO
75                                 "Intel-IOMMU: disable GFX device mapping\n");
76                 } else if (!strncmp(str, "forcedac", 8)) {
77                         printk (KERN_INFO
78                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
79                         dmar_forcedac = 1;
80                 }
81
82                 str += strcspn(str, ",");
83                 while (*str == ',')
84                         str++;
85         }
86         return 0;
87 }
88 __setup("intel_iommu=", intel_iommu_setup);
89
90 static struct kmem_cache *iommu_domain_cache;
91 static struct kmem_cache *iommu_devinfo_cache;
92 static struct kmem_cache *iommu_iova_cache;
93
94 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
95 {
96         unsigned int flags;
97         void *vaddr;
98
99         /* trying to avoid low memory issues */
100         flags = current->flags & PF_MEMALLOC;
101         current->flags |= PF_MEMALLOC;
102         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
103         current->flags &= (~PF_MEMALLOC | flags);
104         return vaddr;
105 }
106
107
108 static inline void *alloc_pgtable_page(void)
109 {
110         unsigned int flags;
111         void *vaddr;
112
113         /* trying to avoid low memory issues */
114         flags = current->flags & PF_MEMALLOC;
115         current->flags |= PF_MEMALLOC;
116         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
117         current->flags &= (~PF_MEMALLOC | flags);
118         return vaddr;
119 }
120
/* Release a page previously obtained from alloc_pgtable_page(). */
static inline void free_pgtable_page(void *vaddr)
{
	unsigned long page = (unsigned long)vaddr;

	free_page(page);
}
125
126 static inline void *alloc_domain_mem(void)
127 {
128         return iommu_kmem_cache_alloc(iommu_domain_cache);
129 }
130
131 static inline void free_domain_mem(void *vaddr)
132 {
133         kmem_cache_free(iommu_domain_cache, vaddr);
134 }
135
136 static inline void * alloc_devinfo_mem(void)
137 {
138         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
139 }
140
141 static inline void free_devinfo_mem(void *vaddr)
142 {
143         kmem_cache_free(iommu_devinfo_cache, vaddr);
144 }
145
146 struct iova *alloc_iova_mem(void)
147 {
148         return iommu_kmem_cache_alloc(iommu_iova_cache);
149 }
150
151 void free_iova_mem(struct iova *iova)
152 {
153         kmem_cache_free(iommu_iova_cache, iova);
154 }
155
156 static inline void __iommu_flush_cache(
157         struct intel_iommu *iommu, void *addr, int size)
158 {
159         if (!ecap_coherent(iommu->ecap))
160                 clflush_cache_range(addr, size);
161 }
162
163 /* Gets context entry for a given bus and devfn */
164 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
165                 u8 bus, u8 devfn)
166 {
167         struct root_entry *root;
168         struct context_entry *context;
169         unsigned long phy_addr;
170         unsigned long flags;
171
172         spin_lock_irqsave(&iommu->lock, flags);
173         root = &iommu->root_entry[bus];
174         context = get_context_addr_from_root(root);
175         if (!context) {
176                 context = (struct context_entry *)alloc_pgtable_page();
177                 if (!context) {
178                         spin_unlock_irqrestore(&iommu->lock, flags);
179                         return NULL;
180                 }
181                 __iommu_flush_cache(iommu, (void *)context, PAGE_SIZE_4K);
182                 phy_addr = virt_to_phys((void *)context);
183                 set_root_value(root, phy_addr);
184                 set_root_present(root);
185                 __iommu_flush_cache(iommu, root, sizeof(*root));
186         }
187         spin_unlock_irqrestore(&iommu->lock, flags);
188         return &context[devfn];
189 }
190
191 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
192 {
193         struct root_entry *root;
194         struct context_entry *context;
195         int ret;
196         unsigned long flags;
197
198         spin_lock_irqsave(&iommu->lock, flags);
199         root = &iommu->root_entry[bus];
200         context = get_context_addr_from_root(root);
201         if (!context) {
202                 ret = 0;
203                 goto out;
204         }
205         ret = context_present(context[devfn]);
206 out:
207         spin_unlock_irqrestore(&iommu->lock, flags);
208         return ret;
209 }
210
211 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
212 {
213         struct root_entry *root;
214         struct context_entry *context;
215         unsigned long flags;
216
217         spin_lock_irqsave(&iommu->lock, flags);
218         root = &iommu->root_entry[bus];
219         context = get_context_addr_from_root(root);
220         if (context) {
221                 context_clear_entry(context[devfn]);
222                 __iommu_flush_cache(iommu, &context[devfn], \
223                         sizeof(*context));
224         }
225         spin_unlock_irqrestore(&iommu->lock, flags);
226 }
227
228 static void free_context_table(struct intel_iommu *iommu)
229 {
230         struct root_entry *root;
231         int i;
232         unsigned long flags;
233         struct context_entry *context;
234
235         spin_lock_irqsave(&iommu->lock, flags);
236         if (!iommu->root_entry) {
237                 goto out;
238         }
239         for (i = 0; i < ROOT_ENTRY_NR; i++) {
240                 root = &iommu->root_entry[i];
241                 context = get_context_addr_from_root(root);
242                 if (context)
243                         free_pgtable_page(context);
244         }
245         free_pgtable_page(iommu->root_entry);
246         iommu->root_entry = NULL;
247 out:
248         spin_unlock_irqrestore(&iommu->lock, flags);
249 }
250
/* page table handling */
#define LEVEL_STRIDE		(9)
#define LEVEL_MASK		(((u64)1 << LEVEL_STRIDE) - 1)

/* Number of page-table levels for an agaw value: agaw + 2. */
static inline int agaw_to_level(int agaw)
{
	const int levels = 2 + agaw;

	return levels;
}
259
260 static inline int agaw_to_width(int agaw)
261 {
262         return 30 + agaw * LEVEL_STRIDE;
263
264 }
265
266 static inline int width_to_agaw(int width)
267 {
268         return (width - 30) / LEVEL_STRIDE;
269 }
270
271 static inline unsigned int level_to_offset_bits(int level)
272 {
273         return (12 + (level - 1) * LEVEL_STRIDE);
274 }
275
276 static inline int address_level_offset(u64 addr, int level)
277 {
278         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
279 }
280
281 static inline u64 level_mask(int level)
282 {
283         return ((u64)-1 << level_to_offset_bits(level));
284 }
285
286 static inline u64 level_size(int level)
287 {
288         return ((u64)1 << level_to_offset_bits(level));
289 }
290
291 static inline u64 align_to_level(u64 addr, int level)
292 {
293         return ((addr + level_size(level) - 1) & level_mask(level));
294 }
295
296 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
297 {
298         int addr_width = agaw_to_width(domain->agaw);
299         struct dma_pte *parent, *pte = NULL;
300         int level = agaw_to_level(domain->agaw);
301         int offset;
302         unsigned long flags;
303
304         BUG_ON(!domain->pgd);
305
306         addr &= (((u64)1) << addr_width) - 1;
307         parent = domain->pgd;
308
309         spin_lock_irqsave(&domain->mapping_lock, flags);
310         while (level > 0) {
311                 void *tmp_page;
312
313                 offset = address_level_offset(addr, level);
314                 pte = &parent[offset];
315                 if (level == 1)
316                         break;
317
318                 if (!dma_pte_present(*pte)) {
319                         tmp_page = alloc_pgtable_page();
320
321                         if (!tmp_page) {
322                                 spin_unlock_irqrestore(&domain->mapping_lock,
323                                         flags);
324                                 return NULL;
325                         }
326                         __iommu_flush_cache(domain->iommu, tmp_page,
327                                         PAGE_SIZE_4K);
328                         dma_set_pte_addr(*pte, virt_to_phys(tmp_page));
329                         /*
330                          * high level table always sets r/w, last level page
331                          * table control read/write
332                          */
333                         dma_set_pte_readable(*pte);
334                         dma_set_pte_writable(*pte);
335                         __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
336                 }
337                 parent = phys_to_virt(dma_pte_addr(*pte));
338                 level--;
339         }
340
341         spin_unlock_irqrestore(&domain->mapping_lock, flags);
342         return pte;
343 }
344
345 /* return address's pte at specific level */
346 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
347                 int level)
348 {
349         struct dma_pte *parent, *pte = NULL;
350         int total = agaw_to_level(domain->agaw);
351         int offset;
352
353         parent = domain->pgd;
354         while (level <= total) {
355                 offset = address_level_offset(addr, total);
356                 pte = &parent[offset];
357                 if (level == total)
358                         return pte;
359
360                 if (!dma_pte_present(*pte))
361                         break;
362                 parent = phys_to_virt(dma_pte_addr(*pte));
363                 total--;
364         }
365         return NULL;
366 }
367
368 /* clear one page's page table */
369 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
370 {
371         struct dma_pte *pte = NULL;
372
373         /* get last level pte */
374         pte = dma_addr_level_pte(domain, addr, 1);
375
376         if (pte) {
377                 dma_clear_pte(*pte);
378                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
379         }
380 }
381
382 /* clear last level pte, a tlb flush should be followed */
383 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
384 {
385         int addr_width = agaw_to_width(domain->agaw);
386
387         start &= (((u64)1) << addr_width) - 1;
388         end &= (((u64)1) << addr_width) - 1;
389         /* in case it's partial page */
390         start = PAGE_ALIGN_4K(start);
391         end &= PAGE_MASK_4K;
392
393         /* we don't need lock here, nobody else touches the iova range */
394         while (start < end) {
395                 dma_pte_clear_one(domain, start);
396                 start += PAGE_SIZE_4K;
397         }
398 }
399
400 /* free page table pages. last level pte should already be cleared */
401 static void dma_pte_free_pagetable(struct dmar_domain *domain,
402         u64 start, u64 end)
403 {
404         int addr_width = agaw_to_width(domain->agaw);
405         struct dma_pte *pte;
406         int total = agaw_to_level(domain->agaw);
407         int level;
408         u64 tmp;
409
410         start &= (((u64)1) << addr_width) - 1;
411         end &= (((u64)1) << addr_width) - 1;
412
413         /* we don't need lock here, nobody else touches the iova range */
414         level = 2;
415         while (level <= total) {
416                 tmp = align_to_level(start, level);
417                 if (tmp >= end || (tmp + level_size(level) > end))
418                         return;
419
420                 while (tmp < end) {
421                         pte = dma_addr_level_pte(domain, tmp, level);
422                         if (pte) {
423                                 free_pgtable_page(
424                                         phys_to_virt(dma_pte_addr(*pte)));
425                                 dma_clear_pte(*pte);
426                                 __iommu_flush_cache(domain->iommu,
427                                                 pte, sizeof(*pte));
428                         }
429                         tmp += level_size(level);
430                 }
431                 level++;
432         }
433         /* free pgd */
434         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
435                 free_pgtable_page(domain->pgd);
436                 domain->pgd = NULL;
437         }
438 }
439
440 /* iommu handling */
441 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
442 {
443         struct root_entry *root;
444         unsigned long flags;
445
446         root = (struct root_entry *)alloc_pgtable_page();
447         if (!root)
448                 return -ENOMEM;
449
450         __iommu_flush_cache(iommu, root, PAGE_SIZE_4K);
451
452         spin_lock_irqsave(&iommu->lock, flags);
453         iommu->root_entry = root;
454         spin_unlock_irqrestore(&iommu->lock, flags);
455
456         return 0;
457 }
458
/*
 * Poll a register until a condition holds.
 *
 * Repeatedly evaluates "sts = op(iommu->reg + offset)" and stops as soon
 * as @cond (an expression that normally refers to @sts) is true. If more
 * than DMAR_OPERATION_TIMEOUT jiffies pass first, the kernel panics.
 *
 * Wrapped in do { } while (0) so that "IOMMU_WAIT_OP(...);" is a single
 * statement and can be used safely inside an unbraced if/else. @iommu
 * and @offset are parenthesized so arbitrary expressions may be passed.
 */
#define IOMMU_WAIT_OP(iommu, offset, op, cond, sts) \
do {\
	unsigned long start_time = jiffies;\
	while (1) {\
		sts = op((iommu)->reg + (offset));\
		if (cond)\
			break;\
		if (time_after(jiffies, start_time + DMAR_OPERATION_TIMEOUT))\
			panic("DMAR hardware is malfunctioning\n");\
		cpu_relax();\
	}\
} while (0)
471
472 static void iommu_set_root_entry(struct intel_iommu *iommu)
473 {
474         void *addr;
475         u32 cmd, sts;
476         unsigned long flag;
477
478         addr = iommu->root_entry;
479
480         spin_lock_irqsave(&iommu->register_lock, flag);
481         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
482
483         cmd = iommu->gcmd | DMA_GCMD_SRTP;
484         writel(cmd, iommu->reg + DMAR_GCMD_REG);
485
486         /* Make sure hardware complete it */
487         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
488                 readl, (sts & DMA_GSTS_RTPS), sts);
489
490         spin_unlock_irqrestore(&iommu->register_lock, flag);
491 }
492
493 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
494 {
495         u32 val;
496         unsigned long flag;
497
498         if (!cap_rwbf(iommu->cap))
499                 return;
500         val = iommu->gcmd | DMA_GCMD_WBF;
501
502         spin_lock_irqsave(&iommu->register_lock, flag);
503         writel(val, iommu->reg + DMAR_GCMD_REG);
504
505         /* Make sure hardware complete it */
506         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
507                         readl, (!(val & DMA_GSTS_WBFS)), val);
508
509         spin_unlock_irqrestore(&iommu->register_lock, flag);
510 }
511
512 /* return value determine if we need a write buffer flush */
513 static int __iommu_flush_context(struct intel_iommu *iommu,
514         u16 did, u16 source_id, u8 function_mask, u64 type,
515         int non_present_entry_flush)
516 {
517         u64 val = 0;
518         unsigned long flag;
519
520         /*
521          * In the non-present entry flush case, if hardware doesn't cache
522          * non-present entry we do nothing and if hardware cache non-present
523          * entry, we flush entries of domain 0 (the domain id is used to cache
524          * any non-present entries)
525          */
526         if (non_present_entry_flush) {
527                 if (!cap_caching_mode(iommu->cap))
528                         return 1;
529                 else
530                         did = 0;
531         }
532
533         switch (type) {
534         case DMA_CCMD_GLOBAL_INVL:
535                 val = DMA_CCMD_GLOBAL_INVL;
536                 break;
537         case DMA_CCMD_DOMAIN_INVL:
538                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
539                 break;
540         case DMA_CCMD_DEVICE_INVL:
541                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
542                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
543                 break;
544         default:
545                 BUG();
546         }
547         val |= DMA_CCMD_ICC;
548
549         spin_lock_irqsave(&iommu->register_lock, flag);
550         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
551
552         /* Make sure hardware complete it */
553         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
554                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
555
556         spin_unlock_irqrestore(&iommu->register_lock, flag);
557
558         /* flush context entry will implictly flush write buffer */
559         return 0;
560 }
561
562 static int inline iommu_flush_context_global(struct intel_iommu *iommu,
563         int non_present_entry_flush)
564 {
565         return __iommu_flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
566                 non_present_entry_flush);
567 }
568
569 static int inline iommu_flush_context_domain(struct intel_iommu *iommu, u16 did,
570         int non_present_entry_flush)
571 {
572         return __iommu_flush_context(iommu, did, 0, 0, DMA_CCMD_DOMAIN_INVL,
573                 non_present_entry_flush);
574 }
575
576 static int inline iommu_flush_context_device(struct intel_iommu *iommu,
577         u16 did, u16 source_id, u8 function_mask, int non_present_entry_flush)
578 {
579         return __iommu_flush_context(iommu, did, source_id, function_mask,
580                 DMA_CCMD_DEVICE_INVL, non_present_entry_flush);
581 }
582
583 /* return value determine if we need a write buffer flush */
584 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
585         u64 addr, unsigned int size_order, u64 type,
586         int non_present_entry_flush)
587 {
588         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
589         u64 val = 0, val_iva = 0;
590         unsigned long flag;
591
592         /*
593          * In the non-present entry flush case, if hardware doesn't cache
594          * non-present entry we do nothing and if hardware cache non-present
595          * entry, we flush entries of domain 0 (the domain id is used to cache
596          * any non-present entries)
597          */
598         if (non_present_entry_flush) {
599                 if (!cap_caching_mode(iommu->cap))
600                         return 1;
601                 else
602                         did = 0;
603         }
604
605         switch (type) {
606         case DMA_TLB_GLOBAL_FLUSH:
607                 /* global flush doesn't need set IVA_REG */
608                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
609                 break;
610         case DMA_TLB_DSI_FLUSH:
611                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
612                 break;
613         case DMA_TLB_PSI_FLUSH:
614                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
615                 /* Note: always flush non-leaf currently */
616                 val_iva = size_order | addr;
617                 break;
618         default:
619                 BUG();
620         }
621         /* Note: set drain read/write */
622 #if 0
623         /*
624          * This is probably to be super secure.. Looks like we can
625          * ignore it without any impact.
626          */
627         if (cap_read_drain(iommu->cap))
628                 val |= DMA_TLB_READ_DRAIN;
629 #endif
630         if (cap_write_drain(iommu->cap))
631                 val |= DMA_TLB_WRITE_DRAIN;
632
633         spin_lock_irqsave(&iommu->register_lock, flag);
634         /* Note: Only uses first TLB reg currently */
635         if (val_iva)
636                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
637         dmar_writeq(iommu->reg + tlb_offset + 8, val);
638
639         /* Make sure hardware complete it */
640         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
641                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
642
643         spin_unlock_irqrestore(&iommu->register_lock, flag);
644
645         /* check IOTLB invalidation granularity */
646         if (DMA_TLB_IAIG(val) == 0)
647                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
648         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
649                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
650                         DMA_TLB_IIRG(type), DMA_TLB_IAIG(val));
651         /* flush context entry will implictly flush write buffer */
652         return 0;
653 }
654
655 static int inline iommu_flush_iotlb_global(struct intel_iommu *iommu,
656         int non_present_entry_flush)
657 {
658         return __iommu_flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
659                 non_present_entry_flush);
660 }
661
662 static int inline iommu_flush_iotlb_dsi(struct intel_iommu *iommu, u16 did,
663         int non_present_entry_flush)
664 {
665         return __iommu_flush_iotlb(iommu, did, 0, 0, DMA_TLB_DSI_FLUSH,
666                 non_present_entry_flush);
667 }
668
669 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
670         u64 addr, unsigned int pages, int non_present_entry_flush)
671 {
672         unsigned int mask;
673
674         BUG_ON(addr & (~PAGE_MASK_4K));
675         BUG_ON(pages == 0);
676
677         /* Fallback to domain selective flush if no PSI support */
678         if (!cap_pgsel_inv(iommu->cap))
679                 return iommu_flush_iotlb_dsi(iommu, did,
680                         non_present_entry_flush);
681
682         /*
683          * PSI requires page size to be 2 ^ x, and the base address is naturally
684          * aligned to the size
685          */
686         mask = ilog2(__roundup_pow_of_two(pages));
687         /* Fallback to domain selective flush if size is too big */
688         if (mask > cap_max_amask_val(iommu->cap))
689                 return iommu_flush_iotlb_dsi(iommu, did,
690                         non_present_entry_flush);
691
692         return __iommu_flush_iotlb(iommu, did, addr, mask,
693                 DMA_TLB_PSI_FLUSH, non_present_entry_flush);
694 }
695
696 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
697 {
698         u32 pmen;
699         unsigned long flags;
700
701         spin_lock_irqsave(&iommu->register_lock, flags);
702         pmen = readl(iommu->reg + DMAR_PMEN_REG);
703         pmen &= ~DMA_PMEN_EPM;
704         writel(pmen, iommu->reg + DMAR_PMEN_REG);
705
706         /* wait for the protected region status bit to clear */
707         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
708                 readl, !(pmen & DMA_PMEN_PRS), pmen);
709
710         spin_unlock_irqrestore(&iommu->register_lock, flags);
711 }
712
713 static int iommu_enable_translation(struct intel_iommu *iommu)
714 {
715         u32 sts;
716         unsigned long flags;
717
718         spin_lock_irqsave(&iommu->register_lock, flags);
719         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
720
721         /* Make sure hardware complete it */
722         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
723                 readl, (sts & DMA_GSTS_TES), sts);
724
725         iommu->gcmd |= DMA_GCMD_TE;
726         spin_unlock_irqrestore(&iommu->register_lock, flags);
727         return 0;
728 }
729
730 static int iommu_disable_translation(struct intel_iommu *iommu)
731 {
732         u32 sts;
733         unsigned long flag;
734
735         spin_lock_irqsave(&iommu->register_lock, flag);
736         iommu->gcmd &= ~DMA_GCMD_TE;
737         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
738
739         /* Make sure hardware complete it */
740         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
741                 readl, (!(sts & DMA_GSTS_TES)), sts);
742
743         spin_unlock_irqrestore(&iommu->register_lock, flag);
744         return 0;
745 }
746
747 /* iommu interrupt handling. Most stuff are MSI-like. */
748
/*
 * Human readable fault descriptions, indexed by fault reason code.
 * Both the strings and the table itself are read-only.
 */
static const char * const fault_reason_strings[] =
{
	"Software",
	"Present bit in root entry is clear",
	"Present bit in context entry is clear",
	"Invalid context entry",
	"Access beyond MGAW",
	"PTE Write access is not set",
	"PTE Read access is not set",
	"Next page table ptr is invalid",
	"Root table address invalid",
	"Context table ptr is invalid",
	"non-zero reserved fields in RTP",
	"non-zero reserved fields in CTP",
	"non-zero reserved fields in PTE",
};
/* Largest fault reason code that has an entry in the table above. */
#define MAX_FAULT_REASON_IDX	(ARRAY_SIZE(fault_reason_strings) - 1)
766
767 const char *dmar_get_fault_reason(u8 fault_reason)
768 {
769         if (fault_reason > MAX_FAULT_REASON_IDX)
770                 return "Unknown";
771         else
772                 return fault_reason_strings[fault_reason];
773 }
774
775 void dmar_msi_unmask(unsigned int irq)
776 {
777         struct intel_iommu *iommu = get_irq_data(irq);
778         unsigned long flag;
779
780         /* unmask it */
781         spin_lock_irqsave(&iommu->register_lock, flag);
782         writel(0, iommu->reg + DMAR_FECTL_REG);
783         /* Read a reg to force flush the post write */
784         readl(iommu->reg + DMAR_FECTL_REG);
785         spin_unlock_irqrestore(&iommu->register_lock, flag);
786 }
787
788 void dmar_msi_mask(unsigned int irq)
789 {
790         unsigned long flag;
791         struct intel_iommu *iommu = get_irq_data(irq);
792
793         /* mask it */
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
796         /* Read a reg to force flush the post write */
797         readl(iommu->reg + DMAR_FECTL_REG);
798         spin_unlock_irqrestore(&iommu->register_lock, flag);
799 }
800
801 void dmar_msi_write(int irq, struct msi_msg *msg)
802 {
803         struct intel_iommu *iommu = get_irq_data(irq);
804         unsigned long flag;
805
806         spin_lock_irqsave(&iommu->register_lock, flag);
807         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
808         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
809         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
810         spin_unlock_irqrestore(&iommu->register_lock, flag);
811 }
812
813 void dmar_msi_read(int irq, struct msi_msg *msg)
814 {
815         struct intel_iommu *iommu = get_irq_data(irq);
816         unsigned long flag;
817
818         spin_lock_irqsave(&iommu->register_lock, flag);
819         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
820         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
821         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
822         spin_unlock_irqrestore(&iommu->register_lock, flag);
823 }
824
825 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
826                 u8 fault_reason, u16 source_id, u64 addr)
827 {
828         const char *reason;
829
830         reason = dmar_get_fault_reason(fault_reason);
831
832         printk(KERN_ERR
833                 "DMAR:[%s] Request device [%02x:%02x.%d] "
834                 "fault addr %llx \n"
835                 "DMAR:[fault reason %02d] %s\n",
836                 (type ? "DMA Read" : "DMA Write"),
837                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
838                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
839         return 0;
840 }
841
842 #define PRIMARY_FAULT_REG_LEN (16)
843 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
844 {
845         struct intel_iommu *iommu = dev_id;
846         int reg, fault_index;
847         u32 fault_status;
848         unsigned long flag;
849
850         spin_lock_irqsave(&iommu->register_lock, flag);
851         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
852
853         /* TBD: ignore advanced fault log currently */
854         if (!(fault_status & DMA_FSTS_PPF))
855                 goto clear_overflow;
856
857         fault_index = dma_fsts_fault_record_index(fault_status);
858         reg = cap_fault_reg_offset(iommu->cap);
859         while (1) {
860                 u8 fault_reason;
861                 u16 source_id;
862                 u64 guest_addr;
863                 int type;
864                 u32 data;
865
866                 /* highest 32 bits */
867                 data = readl(iommu->reg + reg +
868                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
869                 if (!(data & DMA_FRCD_F))
870                         break;
871
872                 fault_reason = dma_frcd_fault_reason(data);
873                 type = dma_frcd_type(data);
874
875                 data = readl(iommu->reg + reg +
876                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
877                 source_id = dma_frcd_source_id(data);
878
879                 guest_addr = dmar_readq(iommu->reg + reg +
880                                 fault_index * PRIMARY_FAULT_REG_LEN);
881                 guest_addr = dma_frcd_page_addr(guest_addr);
882                 /* clear the fault */
883                 writel(DMA_FRCD_F, iommu->reg + reg +
884                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
885
886                 spin_unlock_irqrestore(&iommu->register_lock, flag);
887
888                 iommu_page_fault_do_one(iommu, type, fault_reason,
889                                 source_id, guest_addr);
890
891                 fault_index++;
892                 if (fault_index > cap_num_fault_regs(iommu->cap))
893                         fault_index = 0;
894                 spin_lock_irqsave(&iommu->register_lock, flag);
895         }
896 clear_overflow:
897         /* clear primary fault overflow */
898         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
899         if (fault_status & DMA_FSTS_PFO)
900                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
901
902         spin_unlock_irqrestore(&iommu->register_lock, flag);
903         return IRQ_HANDLED;
904 }
905
906 int dmar_set_interrupt(struct intel_iommu *iommu)
907 {
908         int irq, ret;
909
910         irq = create_irq();
911         if (!irq) {
912                 printk(KERN_ERR "IOMMU: no free vectors\n");
913                 return -EINVAL;
914         }
915
916         set_irq_data(irq, iommu);
917         iommu->irq = irq;
918
919         ret = arch_setup_dmar_msi(irq);
920         if (ret) {
921                 set_irq_data(irq, NULL);
922                 iommu->irq = 0;
923                 destroy_irq(irq);
924                 return 0;
925         }
926
927         /* Force fault register is cleared */
928         iommu_page_fault(irq, iommu);
929
930         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
931         if (ret)
932                 printk(KERN_ERR "IOMMU: can't request irq\n");
933         return ret;
934 }
935
936 static int iommu_init_domains(struct intel_iommu *iommu)
937 {
938         unsigned long ndomains;
939         unsigned long nlongs;
940
941         ndomains = cap_ndoms(iommu->cap);
942         pr_debug("Number of Domains supportd <%ld>\n", ndomains);
943         nlongs = BITS_TO_LONGS(ndomains);
944
945         /* TBD: there might be 64K domains,
946          * consider other allocation for future chip
947          */
948         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
949         if (!iommu->domain_ids) {
950                 printk(KERN_ERR "Allocating domain id array failed\n");
951                 return -ENOMEM;
952         }
953         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
954                         GFP_KERNEL);
955         if (!iommu->domains) {
956                 printk(KERN_ERR "Allocating domain array failed\n");
957                 kfree(iommu->domain_ids);
958                 return -ENOMEM;
959         }
960
961         /*
962          * if Caching mode is set, then invalid translations are tagged
963          * with domainid 0. Hence we need to pre-allocate it.
964          */
965         if (cap_caching_mode(iommu->cap))
966                 set_bit(0, iommu->domain_ids);
967         return 0;
968 }
969
970 static struct intel_iommu *alloc_iommu(struct dmar_drhd_unit *drhd)
971 {
972         struct intel_iommu *iommu;
973         int ret;
974         int map_size;
975         u32 ver;
976
977         iommu = kzalloc(sizeof(*iommu), GFP_KERNEL);
978         if (!iommu)
979                 return NULL;
980         iommu->reg = ioremap(drhd->reg_base_addr, PAGE_SIZE_4K);
981         if (!iommu->reg) {
982                 printk(KERN_ERR "IOMMU: can't map the region\n");
983                 goto error;
984         }
985         iommu->cap = dmar_readq(iommu->reg + DMAR_CAP_REG);
986         iommu->ecap = dmar_readq(iommu->reg + DMAR_ECAP_REG);
987
988         /* the registers might be more than one page */
989         map_size = max_t(int, ecap_max_iotlb_offset(iommu->ecap),
990                 cap_max_fault_reg_offset(iommu->cap));
991         map_size = PAGE_ALIGN_4K(map_size);
992         if (map_size > PAGE_SIZE_4K) {
993                 iounmap(iommu->reg);
994                 iommu->reg = ioremap(drhd->reg_base_addr, map_size);
995                 if (!iommu->reg) {
996                         printk(KERN_ERR "IOMMU: can't map the region\n");
997                         goto error;
998                 }
999         }
1000
1001         ver = readl(iommu->reg + DMAR_VER_REG);
1002         pr_debug("IOMMU %llx: ver %d:%d cap %llx ecap %llx\n",
1003                 drhd->reg_base_addr, DMAR_VER_MAJOR(ver), DMAR_VER_MINOR(ver),
1004                 iommu->cap, iommu->ecap);
1005         ret = iommu_init_domains(iommu);
1006         if (ret)
1007                 goto error_unmap;
1008         spin_lock_init(&iommu->lock);
1009         spin_lock_init(&iommu->register_lock);
1010
1011         drhd->iommu = iommu;
1012         return iommu;
1013 error_unmap:
1014         iounmap(iommu->reg);
1015 error:
1016         kfree(iommu);
1017         return NULL;
1018 }
1019
1020 static void domain_exit(struct dmar_domain *domain);
1021 static void free_iommu(struct intel_iommu *iommu)
1022 {
1023         struct dmar_domain *domain;
1024         int i;
1025
1026         if (!iommu)
1027                 return;
1028
1029         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1030         for (; i < cap_ndoms(iommu->cap); ) {
1031                 domain = iommu->domains[i];
1032                 clear_bit(i, iommu->domain_ids);
1033                 domain_exit(domain);
1034                 i = find_next_bit(iommu->domain_ids,
1035                         cap_ndoms(iommu->cap), i+1);
1036         }
1037
1038         if (iommu->gcmd & DMA_GCMD_TE)
1039                 iommu_disable_translation(iommu);
1040
1041         if (iommu->irq) {
1042                 set_irq_data(iommu->irq, NULL);
1043                 /* This will mask the irq */
1044                 free_irq(iommu->irq, iommu);
1045                 destroy_irq(iommu->irq);
1046         }
1047
1048         kfree(iommu->domains);
1049         kfree(iommu->domain_ids);
1050
1051         /* free context mapping */
1052         free_context_table(iommu);
1053
1054         if (iommu->reg)
1055                 iounmap(iommu->reg);
1056         kfree(iommu);
1057 }
1058
1059 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1060 {
1061         unsigned long num;
1062         unsigned long ndomains;
1063         struct dmar_domain *domain;
1064         unsigned long flags;
1065
1066         domain = alloc_domain_mem();
1067         if (!domain)
1068                 return NULL;
1069
1070         ndomains = cap_ndoms(iommu->cap);
1071
1072         spin_lock_irqsave(&iommu->lock, flags);
1073         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1074         if (num >= ndomains) {
1075                 spin_unlock_irqrestore(&iommu->lock, flags);
1076                 free_domain_mem(domain);
1077                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1078                 return NULL;
1079         }
1080
1081         set_bit(num, iommu->domain_ids);
1082         domain->id = num;
1083         domain->iommu = iommu;
1084         iommu->domains[num] = domain;
1085         spin_unlock_irqrestore(&iommu->lock, flags);
1086
1087         return domain;
1088 }
1089
1090 static void iommu_free_domain(struct dmar_domain *domain)
1091 {
1092         unsigned long flags;
1093
1094         spin_lock_irqsave(&domain->iommu->lock, flags);
1095         clear_bit(domain->id, domain->iommu->domain_ids);
1096         spin_unlock_irqrestore(&domain->iommu->lock, flags);
1097 }
1098
1099 static struct iova_domain reserved_iova_list;
1100
1101 static void dmar_init_reserved_ranges(void)
1102 {
1103         struct pci_dev *pdev = NULL;
1104         struct iova *iova;
1105         int i;
1106         u64 addr, size;
1107
1108         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1109
1110         /* IOAPIC ranges shouldn't be accessed by DMA */
1111         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1112                 IOVA_PFN(IOAPIC_RANGE_END));
1113         if (!iova)
1114                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1115
1116         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1117         for_each_pci_dev(pdev) {
1118                 struct resource *r;
1119
1120                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1121                         r = &pdev->resource[i];
1122                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1123                                 continue;
1124                         addr = r->start;
1125                         addr &= PAGE_MASK_4K;
1126                         size = r->end - addr;
1127                         size = PAGE_ALIGN_4K(size);
1128                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1129                                 IOVA_PFN(size + addr) - 1);
1130                         if (!iova)
1131                                 printk(KERN_ERR "Reserve iova failed\n");
1132                 }
1133         }
1134
1135 }
1136
1137 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1138 {
1139         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1140 }
1141
/*
 * Round a guest address width up to the next value of the form 12 + 9*k
 * and cap the result at 64.
 */
static inline int guestwidth_to_adjustwidth(int gaw)
{
	int rem = (gaw - 12) % 9;
	int agaw = rem ? gaw + 9 - rem : gaw;

	return agaw > 64 ? 64 : agaw;
}
1155
1156 static int domain_init(struct dmar_domain *domain, int guest_width)
1157 {
1158         struct intel_iommu *iommu;
1159         int adjust_width, agaw;
1160         unsigned long sagaw;
1161
1162         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1163         spin_lock_init(&domain->mapping_lock);
1164
1165         domain_reserve_special_ranges(domain);
1166
1167         /* calculate AGAW */
1168         iommu = domain->iommu;
1169         if (guest_width > cap_mgaw(iommu->cap))
1170                 guest_width = cap_mgaw(iommu->cap);
1171         domain->gaw = guest_width;
1172         adjust_width = guestwidth_to_adjustwidth(guest_width);
1173         agaw = width_to_agaw(adjust_width);
1174         sagaw = cap_sagaw(iommu->cap);
1175         if (!test_bit(agaw, &sagaw)) {
1176                 /* hardware doesn't support it, choose a bigger one */
1177                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1178                 agaw = find_next_bit(&sagaw, 5, agaw);
1179                 if (agaw >= 5)
1180                         return -ENODEV;
1181         }
1182         domain->agaw = agaw;
1183         INIT_LIST_HEAD(&domain->devices);
1184
1185         /* always allocate the top pgd */
1186         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1187         if (!domain->pgd)
1188                 return -ENOMEM;
1189         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE_4K);
1190         return 0;
1191 }
1192
1193 static void domain_exit(struct dmar_domain *domain)
1194 {
1195         u64 end;
1196
1197         /* Domain 0 is reserved, so dont process it */
1198         if (!domain)
1199                 return;
1200
1201         domain_remove_dev_info(domain);
1202         /* destroy iovas */
1203         put_iova_domain(&domain->iovad);
1204         end = DOMAIN_MAX_ADDR(domain->gaw);
1205         end = end & (~PAGE_MASK_4K);
1206
1207         /* clear ptes */
1208         dma_pte_clear_range(domain, 0, end);
1209
1210         /* free page tables */
1211         dma_pte_free_pagetable(domain, 0, end);
1212
1213         iommu_free_domain(domain);
1214         free_domain_mem(domain);
1215 }
1216
1217 static int domain_context_mapping_one(struct dmar_domain *domain,
1218                 u8 bus, u8 devfn)
1219 {
1220         struct context_entry *context;
1221         struct intel_iommu *iommu = domain->iommu;
1222         unsigned long flags;
1223
1224         pr_debug("Set context mapping for %02x:%02x.%d\n",
1225                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1226         BUG_ON(!domain->pgd);
1227         context = device_to_context_entry(iommu, bus, devfn);
1228         if (!context)
1229                 return -ENOMEM;
1230         spin_lock_irqsave(&iommu->lock, flags);
1231         if (context_present(*context)) {
1232                 spin_unlock_irqrestore(&iommu->lock, flags);
1233                 return 0;
1234         }
1235
1236         context_set_domain_id(*context, domain->id);
1237         context_set_address_width(*context, domain->agaw);
1238         context_set_address_root(*context, virt_to_phys(domain->pgd));
1239         context_set_translation_type(*context, CONTEXT_TT_MULTI_LEVEL);
1240         context_set_fault_enable(*context);
1241         context_set_present(*context);
1242         __iommu_flush_cache(iommu, context, sizeof(*context));
1243
1244         /* it's a non-present to present mapping */
1245         if (iommu_flush_context_device(iommu, domain->id,
1246                         (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, 1))
1247                 iommu_flush_write_buffer(iommu);
1248         else
1249                 iommu_flush_iotlb_dsi(iommu, 0, 0);
1250         spin_unlock_irqrestore(&iommu->lock, flags);
1251         return 0;
1252 }
1253
1254 static int
1255 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1256 {
1257         int ret;
1258         struct pci_dev *tmp, *parent;
1259
1260         ret = domain_context_mapping_one(domain, pdev->bus->number,
1261                 pdev->devfn);
1262         if (ret)
1263                 return ret;
1264
1265         /* dependent device mapping */
1266         tmp = pci_find_upstream_pcie_bridge(pdev);
1267         if (!tmp)
1268                 return 0;
1269         /* Secondary interface's bus number and devfn 0 */
1270         parent = pdev->bus->self;
1271         while (parent != tmp) {
1272                 ret = domain_context_mapping_one(domain, parent->bus->number,
1273                         parent->devfn);
1274                 if (ret)
1275                         return ret;
1276                 parent = parent->bus->self;
1277         }
1278         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1279                 return domain_context_mapping_one(domain,
1280                         tmp->subordinate->number, 0);
1281         else /* this is a legacy PCI bridge */
1282                 return domain_context_mapping_one(domain,
1283                         tmp->bus->number, tmp->devfn);
1284 }
1285
1286 static int domain_context_mapped(struct dmar_domain *domain,
1287         struct pci_dev *pdev)
1288 {
1289         int ret;
1290         struct pci_dev *tmp, *parent;
1291
1292         ret = device_context_mapped(domain->iommu,
1293                 pdev->bus->number, pdev->devfn);
1294         if (!ret)
1295                 return ret;
1296         /* dependent device mapping */
1297         tmp = pci_find_upstream_pcie_bridge(pdev);
1298         if (!tmp)
1299                 return ret;
1300         /* Secondary interface's bus number and devfn 0 */
1301         parent = pdev->bus->self;
1302         while (parent != tmp) {
1303                 ret = device_context_mapped(domain->iommu, parent->bus->number,
1304                         parent->devfn);
1305                 if (!ret)
1306                         return ret;
1307                 parent = parent->bus->self;
1308         }
1309         if (tmp->is_pcie)
1310                 return device_context_mapped(domain->iommu,
1311                         tmp->subordinate->number, 0);
1312         else
1313                 return device_context_mapped(domain->iommu,
1314                         tmp->bus->number, tmp->devfn);
1315 }
1316
1317 static int
1318 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1319                         u64 hpa, size_t size, int prot)
1320 {
1321         u64 start_pfn, end_pfn;
1322         struct dma_pte *pte;
1323         int index;
1324
1325         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1326                 return -EINVAL;
1327         iova &= PAGE_MASK_4K;
1328         start_pfn = ((u64)hpa) >> PAGE_SHIFT_4K;
1329         end_pfn = (PAGE_ALIGN_4K(((u64)hpa) + size)) >> PAGE_SHIFT_4K;
1330         index = 0;
1331         while (start_pfn < end_pfn) {
1332                 pte = addr_to_dma_pte(domain, iova + PAGE_SIZE_4K * index);
1333                 if (!pte)
1334                         return -ENOMEM;
1335                 /* We don't need lock here, nobody else
1336                  * touches the iova range
1337                  */
1338                 BUG_ON(dma_pte_addr(*pte));
1339                 dma_set_pte_addr(*pte, start_pfn << PAGE_SHIFT_4K);
1340                 dma_set_pte_prot(*pte, prot);
1341                 __iommu_flush_cache(domain->iommu, pte, sizeof(*pte));
1342                 start_pfn++;
1343                 index++;
1344         }
1345         return 0;
1346 }
1347
1348 static void detach_domain_for_dev(struct dmar_domain *domain, u8 bus, u8 devfn)
1349 {
1350         clear_context_table(domain->iommu, bus, devfn);
1351         iommu_flush_context_global(domain->iommu, 0);
1352         iommu_flush_iotlb_global(domain->iommu, 0);
1353 }
1354
1355 static void domain_remove_dev_info(struct dmar_domain *domain)
1356 {
1357         struct device_domain_info *info;
1358         unsigned long flags;
1359
1360         spin_lock_irqsave(&device_domain_lock, flags);
1361         while (!list_empty(&domain->devices)) {
1362                 info = list_entry(domain->devices.next,
1363                         struct device_domain_info, link);
1364                 list_del(&info->link);
1365                 list_del(&info->global);
1366                 if (info->dev)
1367                         info->dev->dev.archdata.iommu = NULL;
1368                 spin_unlock_irqrestore(&device_domain_lock, flags);
1369
1370                 detach_domain_for_dev(info->domain, info->bus, info->devfn);
1371                 free_devinfo_mem(info);
1372
1373                 spin_lock_irqsave(&device_domain_lock, flags);
1374         }
1375         spin_unlock_irqrestore(&device_domain_lock, flags);
1376 }
1377
1378 /*
1379  * find_domain
1380  * Note: we use struct pci_dev->dev.archdata.iommu stores the info
1381  */
1382 struct dmar_domain *
1383 find_domain(struct pci_dev *pdev)
1384 {
1385         struct device_domain_info *info;
1386
1387         /* No lock here, assumes no domain exit in normal case */
1388         info = pdev->dev.archdata.iommu;
1389         if (info)
1390                 return info->domain;
1391         return NULL;
1392 }
1393
1394 static int dmar_pci_device_match(struct pci_dev *devices[], int cnt,
1395      struct pci_dev *dev)
1396 {
1397         int index;
1398
1399         while (dev) {
1400                 for (index = 0; index < cnt; index ++)
1401                         if (dev == devices[index])
1402                                 return 1;
1403
1404                 /* Check our parent */
1405                 dev = dev->bus->self;
1406         }
1407
1408         return 0;
1409 }
1410
1411 static struct dmar_drhd_unit *
1412 dmar_find_matched_drhd_unit(struct pci_dev *dev)
1413 {
1414         struct dmar_drhd_unit *drhd = NULL;
1415
1416         list_for_each_entry(drhd, &dmar_drhd_units, list) {
1417                 if (drhd->include_all || dmar_pci_device_match(drhd->devices,
1418                                                 drhd->devices_cnt, dev))
1419                         return drhd;
1420         }
1421
1422         return NULL;
1423 }
1424
1425 /* domain is initialized */
1426 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1427 {
1428         struct dmar_domain *domain, *found = NULL;
1429         struct intel_iommu *iommu;
1430         struct dmar_drhd_unit *drhd;
1431         struct device_domain_info *info, *tmp;
1432         struct pci_dev *dev_tmp;
1433         unsigned long flags;
1434         int bus = 0, devfn = 0;
1435
1436         domain = find_domain(pdev);
1437         if (domain)
1438                 return domain;
1439
1440         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1441         if (dev_tmp) {
1442                 if (dev_tmp->is_pcie) {
1443                         bus = dev_tmp->subordinate->number;
1444                         devfn = 0;
1445                 } else {
1446                         bus = dev_tmp->bus->number;
1447                         devfn = dev_tmp->devfn;
1448                 }
1449                 spin_lock_irqsave(&device_domain_lock, flags);
1450                 list_for_each_entry(info, &device_domain_list, global) {
1451                         if (info->bus == bus && info->devfn == devfn) {
1452                                 found = info->domain;
1453                                 break;
1454                         }
1455                 }
1456                 spin_unlock_irqrestore(&device_domain_lock, flags);
1457                 /* pcie-pci bridge already has a domain, uses it */
1458                 if (found) {
1459                         domain = found;
1460                         goto found_domain;
1461                 }
1462         }
1463
1464         /* Allocate new domain for the device */
1465         drhd = dmar_find_matched_drhd_unit(pdev);
1466         if (!drhd) {
1467                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1468                         pci_name(pdev));
1469                 return NULL;
1470         }
1471         iommu = drhd->iommu;
1472
1473         domain = iommu_alloc_domain(iommu);
1474         if (!domain)
1475                 goto error;
1476
1477         if (domain_init(domain, gaw)) {
1478                 domain_exit(domain);
1479                 goto error;
1480         }
1481
1482         /* register pcie-to-pci device */
1483         if (dev_tmp) {
1484                 info = alloc_devinfo_mem();
1485                 if (!info) {
1486                         domain_exit(domain);
1487                         goto error;
1488                 }
1489                 info->bus = bus;
1490                 info->devfn = devfn;
1491                 info->dev = NULL;
1492                 info->domain = domain;
1493                 /* This domain is shared by devices under p2p bridge */
1494                 domain->flags |= DOMAIN_FLAG_MULTIPLE_DEVICES;
1495
1496                 /* pcie-to-pci bridge already has a domain, uses it */
1497                 found = NULL;
1498                 spin_lock_irqsave(&device_domain_lock, flags);
1499                 list_for_each_entry(tmp, &device_domain_list, global) {
1500                         if (tmp->bus == bus && tmp->devfn == devfn) {
1501                                 found = tmp->domain;
1502                                 break;
1503                         }
1504                 }
1505                 if (found) {
1506                         free_devinfo_mem(info);
1507                         domain_exit(domain);
1508                         domain = found;
1509                 } else {
1510                         list_add(&info->link, &domain->devices);
1511                         list_add(&info->global, &device_domain_list);
1512                 }
1513                 spin_unlock_irqrestore(&device_domain_lock, flags);
1514         }
1515
1516 found_domain:
1517         info = alloc_devinfo_mem();
1518         if (!info)
1519                 goto error;
1520         info->bus = pdev->bus->number;
1521         info->devfn = pdev->devfn;
1522         info->dev = pdev;
1523         info->domain = domain;
1524         spin_lock_irqsave(&device_domain_lock, flags);
1525         /* somebody is fast */
1526         found = find_domain(pdev);
1527         if (found != NULL) {
1528                 spin_unlock_irqrestore(&device_domain_lock, flags);
1529                 if (found != domain) {
1530                         domain_exit(domain);
1531                         domain = found;
1532                 }
1533                 free_devinfo_mem(info);
1534                 return domain;
1535         }
1536         list_add(&info->link, &domain->devices);
1537         list_add(&info->global, &device_domain_list);
1538         pdev->dev.archdata.iommu = info;
1539         spin_unlock_irqrestore(&device_domain_lock, flags);
1540         return domain;
1541 error:
1542         /* recheck it here, maybe others set it */
1543         return find_domain(pdev);
1544 }
1545
1546 static int iommu_prepare_identity_map(struct pci_dev *pdev, u64 start, u64 end)
1547 {
1548         struct dmar_domain *domain;
1549         unsigned long size;
1550         u64 base;
1551         int ret;
1552
1553         printk(KERN_INFO
1554                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1555                 pci_name(pdev), start, end);
1556         /* page table init */
1557         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1558         if (!domain)
1559                 return -ENOMEM;
1560
1561         /* The address might not be aligned */
1562         base = start & PAGE_MASK_4K;
1563         size = end - base;
1564         size = PAGE_ALIGN_4K(size);
1565         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1566                         IOVA_PFN(base + size) - 1)) {
1567                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1568                 ret = -ENOMEM;
1569                 goto error;
1570         }
1571
1572         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1573                 size, base, pci_name(pdev));
1574         /*
1575          * RMRR range might have overlap with physical memory range,
1576          * clear it first
1577          */
1578         dma_pte_clear_range(domain, base, base + size);
1579
1580         ret = domain_page_mapping(domain, base, base, size,
1581                 DMA_PTE_READ|DMA_PTE_WRITE);
1582         if (ret)
1583                 goto error;
1584
1585         /* context entry init */
1586         ret = domain_context_mapping(domain, pdev);
1587         if (!ret)
1588                 return 0;
1589 error:
1590         domain_exit(domain);
1591         return ret;
1592
1593 }
1594
1595 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1596         struct pci_dev *pdev)
1597 {
1598         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1599                 return 0;
1600         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1601                 rmrr->end_address + 1);
1602 }
1603
#ifdef CONFIG_DMAR_GFX_WA
extern int arch_get_ram_range(int slot, u64 *addr, u64 *size);

/*
 * iommu_prepare_gfx_mapping - identity-map RAM for graphics devices
 *
 * For every PCI display device that is not marked with
 * DUMMY_DEVICE_DOMAIN_INFO, walks the RAM ranges reported by
 * arch_get_ram_range() and identity-maps each of them.  A failure is
 * logged and ends the walk for that device; the next device is still
 * processed.
 */
static void __init iommu_prepare_gfx_mapping(void)
{
	struct pci_dev *pdev = NULL;
	u64 base, size;
	int slot;
	int err;

	for_each_pci_dev(pdev) {
		if (!IS_GFX_DEVICE(pdev))
			continue;
		if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
			continue;

		printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
			pci_name(pdev));

		for (slot = arch_get_ram_range(0, &base, &size);
		     slot >= 0;
		     slot = arch_get_ram_range(slot, &base, &size)) {
			err = iommu_prepare_identity_map(pdev,
					base, base + size);
			if (err) {
				printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
				break;
			}
		}
	}
}
#endif
1633
#ifdef CONFIG_DMAR_FLOPPY_WA
/*
 * iommu_prepare_isa - floppy workaround: identity-map the low 16M for the
 * ISA/LPC bridge
 *
 * Looks up the first PCI device of class PCI_CLASS_BRIDGE_ISA and gives it
 * a 1:1 mapping of [0, 16M).  A failure is only logged.  Does nothing if
 * no such bridge exists.
 */
static inline void iommu_prepare_isa(void)
{
	struct pci_dev *pdev;
	int ret;

	pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
	if (!pdev)
		return;

	printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
	ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);

	if (ret)
		printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
			"floppy might not work\n");

	/* drop the reference pci_get_class() took on the bridge */
	pci_dev_put(pdev);
}
#else
/* Without the floppy workaround there is nothing to prepare. */
static inline void iommu_prepare_isa(void)
{
	return;
}
#endif /* !CONFIG_DMAR_FLOPPY_WA */
1658
1659 int __init init_dmars(void)
1660 {
1661         struct dmar_drhd_unit *drhd;
1662         struct dmar_rmrr_unit *rmrr;
1663         struct pci_dev *pdev;
1664         struct intel_iommu *iommu;
1665         int ret, unit = 0;
1666
1667         /*
1668          * for each drhd
1669          *    allocate root
1670          *    initialize and program root entry to not present
1671          * endfor
1672          */
1673         for_each_drhd_unit(drhd) {
1674                 if (drhd->ignored)
1675                         continue;
1676                 iommu = alloc_iommu(drhd);
1677                 if (!iommu) {
1678                         ret = -ENOMEM;
1679                         goto error;
1680                 }
1681
1682                 /*
1683                  * TBD:
1684                  * we could share the same root & context tables
1685                  * amoung all IOMMU's. Need to Split it later.
1686                  */
1687                 ret = iommu_alloc_root_entry(iommu);
1688                 if (ret) {
1689                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
1690                         goto error;
1691                 }
1692         }
1693
1694         /*
1695          * For each rmrr
1696          *   for each dev attached to rmrr
1697          *   do
1698          *     locate drhd for dev, alloc domain for dev
1699          *     allocate free domain
1700          *     allocate page table entries for rmrr
1701          *     if context not allocated for bus
1702          *           allocate and init context
1703          *           set present in root table for this bus
1704          *     init context with domain, translation etc
1705          *    endfor
1706          * endfor
1707          */
1708         for_each_rmrr_units(rmrr) {
1709                 int i;
1710                 for (i = 0; i < rmrr->devices_cnt; i++) {
1711                         pdev = rmrr->devices[i];
1712                         /* some BIOS lists non-exist devices in DMAR table */
1713                         if (!pdev)
1714                                 continue;
1715                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
1716                         if (ret)
1717                                 printk(KERN_ERR
1718                                  "IOMMU: mapping reserved region failed\n");
1719                 }
1720         }
1721
1722         iommu_prepare_gfx_mapping();
1723
1724         iommu_prepare_isa();
1725
1726         /*
1727          * for each drhd
1728          *   enable fault log
1729          *   global invalidate context cache
1730          *   global invalidate iotlb
1731          *   enable translation
1732          */
1733         for_each_drhd_unit(drhd) {
1734                 if (drhd->ignored)
1735                         continue;
1736                 iommu = drhd->iommu;
1737                 sprintf (iommu->name, "dmar%d", unit++);
1738
1739                 iommu_flush_write_buffer(iommu);
1740
1741                 ret = dmar_set_interrupt(iommu);
1742                 if (ret)
1743                         goto error;
1744
1745                 iommu_set_root_entry(iommu);
1746
1747                 iommu_flush_context_global(iommu, 0);
1748                 iommu_flush_iotlb_global(iommu, 0);
1749
1750                 iommu_disable_protect_mem_regions(iommu);
1751
1752                 ret = iommu_enable_translation(iommu);
1753                 if (ret)
1754                         goto error;
1755         }
1756
1757         return 0;
1758 error:
1759         for_each_drhd_unit(drhd) {
1760                 if (drhd->ignored)
1761                         continue;
1762                 iommu = drhd->iommu;
1763                 free_iommu(iommu);
1764         }
1765         return ret;
1766 }
1767
1768 static inline u64 aligned_size(u64 host_addr, size_t size)
1769 {
1770         u64 addr;
1771         addr = (host_addr & (~PAGE_MASK_4K)) + size;
1772         return PAGE_ALIGN_4K(addr);
1773 }
1774
1775 struct iova *
1776 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
1777 {
1778         struct iova *piova;
1779
1780         /* Make sure it's in range */
1781         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
1782         if (!size || (IOVA_START_ADDR + size > end))
1783                 return NULL;
1784
1785         piova = alloc_iova(&domain->iovad,
1786                         size >> PAGE_SHIFT_4K, IOVA_PFN(end), 1);
1787         return piova;
1788 }
1789
1790 static struct iova *
1791 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
1792                 size_t size)
1793 {
1794         struct pci_dev *pdev = to_pci_dev(dev);
1795         struct iova *iova = NULL;
1796
1797         if ((pdev->dma_mask <= DMA_32BIT_MASK) || (dmar_forcedac)) {
1798                 iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1799         } else  {
1800                 /*
1801                  * First try to allocate an io virtual address in
1802                  * DMA_32BIT_MASK and if that fails then try allocating
1803                  * from higher range
1804                  */
1805                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
1806                 if (!iova)
1807                         iova = iommu_alloc_iova(domain, size, pdev->dma_mask);
1808         }
1809
1810         if (!iova) {
1811                 printk(KERN_ERR"Allocating iova for %s failed", pci_name(pdev));
1812                 return NULL;
1813         }
1814
1815         return iova;
1816 }
1817
1818 static struct dmar_domain *
1819 get_valid_domain_for_dev(struct pci_dev *pdev)
1820 {
1821         struct dmar_domain *domain;
1822         int ret;
1823
1824         domain = get_domain_for_dev(pdev,
1825                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
1826         if (!domain) {
1827                 printk(KERN_ERR
1828                         "Allocating domain for %s failed", pci_name(pdev));
1829                 return NULL;
1830         }
1831
1832         /* make sure context mapping is ok */
1833         if (unlikely(!domain_context_mapped(domain, pdev))) {
1834                 ret = domain_context_mapping(domain, pdev);
1835                 if (ret) {
1836                         printk(KERN_ERR
1837                                 "Domain context map for %s failed",
1838                                 pci_name(pdev));
1839                         return NULL;
1840                 }
1841         }
1842
1843         return domain;
1844 }
1845
1846 static dma_addr_t intel_map_single(struct device *hwdev, void *addr,
1847         size_t size, int dir)
1848 {
1849         struct pci_dev *pdev = to_pci_dev(hwdev);
1850         int ret;
1851         struct dmar_domain *domain;
1852         unsigned long start_addr;
1853         struct iova *iova;
1854         int prot = 0;
1855
1856         BUG_ON(dir == DMA_NONE);
1857         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1858                 return virt_to_bus(addr);
1859
1860         domain = get_valid_domain_for_dev(pdev);
1861         if (!domain)
1862                 return 0;
1863
1864         addr = (void *)virt_to_phys(addr);
1865         size = aligned_size((u64)addr, size);
1866
1867         iova = __intel_alloc_iova(hwdev, domain, size);
1868         if (!iova)
1869                 goto error;
1870
1871         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1872
1873         /*
1874          * Check if DMAR supports zero-length reads on write only
1875          * mappings..
1876          */
1877         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
1878                         !cap_zlr(domain->iommu->cap))
1879                 prot |= DMA_PTE_READ;
1880         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
1881                 prot |= DMA_PTE_WRITE;
1882         /*
1883          * addr - (addr + size) might be partial page, we should map the whole
1884          * page.  Note: if two part of one page are separately mapped, we
1885          * might have two guest_addr mapping to the same host addr, but this
1886          * is not a big problem
1887          */
1888         ret = domain_page_mapping(domain, start_addr,
1889                 ((u64)addr) & PAGE_MASK_4K, size, prot);
1890         if (ret)
1891                 goto error;
1892
1893         pr_debug("Device %s request: %lx@%llx mapping: %lx@%llx, dir %d\n",
1894                 pci_name(pdev), size, (u64)addr,
1895                 size, (u64)start_addr, dir);
1896
1897         /* it's a non-present to present mapping */
1898         ret = iommu_flush_iotlb_psi(domain->iommu, domain->id,
1899                         start_addr, size >> PAGE_SHIFT_4K, 1);
1900         if (ret)
1901                 iommu_flush_write_buffer(domain->iommu);
1902
1903         return (start_addr + ((u64)addr & (~PAGE_MASK_4K)));
1904
1905 error:
1906         if (iova)
1907                 __free_iova(&domain->iovad, iova);
1908         printk(KERN_ERR"Device %s request: %lx@%llx dir %d --- failed\n",
1909                 pci_name(pdev), size, (u64)addr, dir);
1910         return 0;
1911 }
1912
1913 static void intel_unmap_single(struct device *dev, dma_addr_t dev_addr,
1914         size_t size, int dir)
1915 {
1916         struct pci_dev *pdev = to_pci_dev(dev);
1917         struct dmar_domain *domain;
1918         unsigned long start_addr;
1919         struct iova *iova;
1920
1921         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1922                 return;
1923         domain = find_domain(pdev);
1924         BUG_ON(!domain);
1925
1926         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
1927         if (!iova)
1928                 return;
1929
1930         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
1931         size = aligned_size((u64)dev_addr, size);
1932
1933         pr_debug("Device %s unmapping: %lx@%llx\n",
1934                 pci_name(pdev), size, (u64)start_addr);
1935
1936         /*  clear the whole page */
1937         dma_pte_clear_range(domain, start_addr, start_addr + size);
1938         /* free page tables */
1939         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
1940
1941         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
1942                         size >> PAGE_SHIFT_4K, 0))
1943                 iommu_flush_write_buffer(domain->iommu);
1944
1945         /* free iova */
1946         __free_iova(&domain->iovad, iova);
1947 }
1948
1949 static void * intel_alloc_coherent(struct device *hwdev, size_t size,
1950                        dma_addr_t *dma_handle, gfp_t flags)
1951 {
1952         void *vaddr;
1953         int order;
1954
1955         size = PAGE_ALIGN_4K(size);
1956         order = get_order(size);
1957         flags &= ~(GFP_DMA | GFP_DMA32);
1958
1959         vaddr = (void *)__get_free_pages(flags, order);
1960         if (!vaddr)
1961                 return NULL;
1962         memset(vaddr, 0, size);
1963
1964         *dma_handle = intel_map_single(hwdev, vaddr, size, DMA_BIDIRECTIONAL);
1965         if (*dma_handle)
1966                 return vaddr;
1967         free_pages((unsigned long)vaddr, order);
1968         return NULL;
1969 }
1970
1971 static void intel_free_coherent(struct device *hwdev, size_t size,
1972         void *vaddr, dma_addr_t dma_handle)
1973 {
1974         int order;
1975
1976         size = PAGE_ALIGN_4K(size);
1977         order = get_order(size);
1978
1979         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
1980         free_pages((unsigned long)vaddr, order);
1981 }
1982
1983 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
1984 static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
1985         int nelems, int dir)
1986 {
1987         int i;
1988         struct pci_dev *pdev = to_pci_dev(hwdev);
1989         struct dmar_domain *domain;
1990         unsigned long start_addr;
1991         struct iova *iova;
1992         size_t size = 0;
1993         void *addr;
1994         struct scatterlist *sg;
1995
1996         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1997                 return;
1998
1999         domain = find_domain(pdev);
2000
2001         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2002         if (!iova)
2003                 return;
2004         for_each_sg(sglist, sg, nelems, i) {
2005                 addr = SG_ENT_VIRT_ADDRESS(sg);
2006                 size += aligned_size((u64)addr, sg->length);
2007         }
2008
2009         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2010
2011         /*  clear the whole page */
2012         dma_pte_clear_range(domain, start_addr, start_addr + size);
2013         /* free page tables */
2014         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2015
2016         if (iommu_flush_iotlb_psi(domain->iommu, domain->id, start_addr,
2017                         size >> PAGE_SHIFT_4K, 0))
2018                 iommu_flush_write_buffer(domain->iommu);
2019
2020         /* free iova */
2021         __free_iova(&domain->iovad, iova);
2022 }
2023
2024 static int intel_nontranslate_map_sg(struct device *hddev,
2025         struct scatterlist *sglist, int nelems, int dir)
2026 {
2027         int i;
2028         struct scatterlist *sg;
2029
2030         for_each_sg(sglist, sg, nelems, i) {
2031                 BUG_ON(!sg_page(sg));
2032                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2033                 sg->dma_length = sg->length;
2034         }
2035         return nelems;
2036 }
2037
2038 static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist,
2039                                 int nelems, int dir)
2040 {
2041         void *addr;
2042         int i;
2043         struct pci_dev *pdev = to_pci_dev(hwdev);
2044         struct dmar_domain *domain;
2045         size_t size = 0;
2046         int prot = 0;
2047         size_t offset = 0;
2048         struct iova *iova = NULL;
2049         int ret;
2050         struct scatterlist *sg;
2051         unsigned long start_addr;
2052
2053         BUG_ON(dir == DMA_NONE);
2054         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2055                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2056
2057         domain = get_valid_domain_for_dev(pdev);
2058         if (!domain)
2059                 return 0;
2060
2061         for_each_sg(sglist, sg, nelems, i) {
2062                 addr = SG_ENT_VIRT_ADDRESS(sg);
2063                 addr = (void *)virt_to_phys(addr);
2064                 size += aligned_size((u64)addr, sg->length);
2065         }
2066
2067         iova = __intel_alloc_iova(hwdev, domain, size);
2068         if (!iova) {
2069                 sglist->dma_length = 0;
2070                 return 0;
2071         }
2072
2073         /*
2074          * Check if DMAR supports zero-length reads on write only
2075          * mappings..
2076          */
2077         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2078                         !cap_zlr(domain->iommu->cap))
2079                 prot |= DMA_PTE_READ;
2080         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2081                 prot |= DMA_PTE_WRITE;
2082
2083         start_addr = iova->pfn_lo << PAGE_SHIFT_4K;
2084         offset = 0;
2085         for_each_sg(sglist, sg, nelems, i) {
2086                 addr = SG_ENT_VIRT_ADDRESS(sg);
2087                 addr = (void *)virt_to_phys(addr);
2088                 size = aligned_size((u64)addr, sg->length);
2089                 ret = domain_page_mapping(domain, start_addr + offset,
2090                         ((u64)addr) & PAGE_MASK_4K,
2091                         size, prot);
2092                 if (ret) {
2093                         /*  clear the page */
2094                         dma_pte_clear_range(domain, start_addr,
2095                                   start_addr + offset);
2096                         /* free page tables */
2097                         dma_pte_free_pagetable(domain, start_addr,
2098                                   start_addr + offset);
2099                         /* free iova */
2100                         __free_iova(&domain->iovad, iova);
2101                         return 0;
2102                 }
2103                 sg->dma_address = start_addr + offset +
2104                                 ((u64)addr & (~PAGE_MASK_4K));
2105                 sg->dma_length = sg->length;
2106                 offset += size;
2107         }
2108
2109         /* it's a non-present to present mapping */
2110         if (iommu_flush_iotlb_psi(domain->iommu, domain->id,
2111                         start_addr, offset >> PAGE_SHIFT_4K, 1))
2112                 iommu_flush_write_buffer(domain->iommu);
2113         return nelems;
2114 }
2115
2116 static struct dma_mapping_ops intel_dma_ops = {
2117         .alloc_coherent = intel_alloc_coherent,
2118         .free_coherent = intel_free_coherent,
2119         .map_single = intel_map_single,
2120         .unmap_single = intel_unmap_single,
2121         .map_sg = intel_map_sg,
2122         .unmap_sg = intel_unmap_sg,
2123 };
2124
2125 static inline int iommu_domain_cache_init(void)
2126 {
2127         int ret = 0;
2128
2129         iommu_domain_cache = kmem_cache_create("iommu_domain",
2130                                          sizeof(struct dmar_domain),
2131                                          0,
2132                                          SLAB_HWCACHE_ALIGN,
2133
2134                                          NULL);
2135         if (!iommu_domain_cache) {
2136                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2137                 ret = -ENOMEM;
2138         }
2139
2140         return ret;
2141 }
2142
2143 static inline int iommu_devinfo_cache_init(void)
2144 {
2145         int ret = 0;
2146
2147         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2148                                          sizeof(struct device_domain_info),
2149                                          0,
2150                                          SLAB_HWCACHE_ALIGN,
2151
2152                                          NULL);
2153         if (!iommu_devinfo_cache) {
2154                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2155                 ret = -ENOMEM;
2156         }
2157
2158         return ret;
2159 }
2160
2161 static inline int iommu_iova_cache_init(void)
2162 {
2163         int ret = 0;
2164
2165         iommu_iova_cache = kmem_cache_create("iommu_iova",
2166                                          sizeof(struct iova),
2167                                          0,
2168                                          SLAB_HWCACHE_ALIGN,
2169
2170                                          NULL);
2171         if (!iommu_iova_cache) {
2172                 printk(KERN_ERR "Couldn't create iova cache\n");
2173                 ret = -ENOMEM;
2174         }
2175
2176         return ret;
2177 }
2178
2179 static int __init iommu_init_mempool(void)
2180 {
2181         int ret;
2182         ret = iommu_iova_cache_init();
2183         if (ret)
2184                 return ret;
2185
2186         ret = iommu_domain_cache_init();
2187         if (ret)
2188                 goto domain_error;
2189
2190         ret = iommu_devinfo_cache_init();
2191         if (!ret)
2192                 return ret;
2193
2194         kmem_cache_destroy(iommu_domain_cache);
2195 domain_error:
2196         kmem_cache_destroy(iommu_iova_cache);
2197
2198         return -ENOMEM;
2199 }
2200
2201 static void __init iommu_exit_mempool(void)
2202 {
2203         kmem_cache_destroy(iommu_devinfo_cache);
2204         kmem_cache_destroy(iommu_domain_cache);
2205         kmem_cache_destroy(iommu_iova_cache);
2206
2207 }
2208
2209 void __init detect_intel_iommu(void)
2210 {
2211         if (swiotlb || no_iommu || iommu_detected || dmar_disabled)
2212                 return;
2213         if (early_dmar_detect()) {
2214                 iommu_detected = 1;
2215         }
2216 }
2217
2218 static void __init init_no_remapping_devices(void)
2219 {
2220         struct dmar_drhd_unit *drhd;
2221
2222         for_each_drhd_unit(drhd) {
2223                 if (!drhd->include_all) {
2224                         int i;
2225                         for (i = 0; i < drhd->devices_cnt; i++)
2226                                 if (drhd->devices[i] != NULL)
2227                                         break;
2228                         /* ignore DMAR unit if no pci devices exist */
2229                         if (i == drhd->devices_cnt)
2230                                 drhd->ignored = 1;
2231                 }
2232         }
2233
2234         if (dmar_map_gfx)
2235                 return;
2236
2237         for_each_drhd_unit(drhd) {
2238                 int i;
2239                 if (drhd->ignored || drhd->include_all)
2240                         continue;
2241
2242                 for (i = 0; i < drhd->devices_cnt; i++)
2243                         if (drhd->devices[i] &&
2244                                 !IS_GFX_DEVICE(drhd->devices[i]))
2245                                 break;
2246
2247                 if (i < drhd->devices_cnt)
2248                         continue;
2249
2250                 /* bypass IOMMU if it is just for gfx devices */
2251                 drhd->ignored = 1;
2252                 for (i = 0; i < drhd->devices_cnt; i++) {
2253                         if (!drhd->devices[i])
2254                                 continue;
2255                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2256                 }
2257         }
2258 }
2259
2260 int __init intel_iommu_init(void)
2261 {
2262         int ret = 0;
2263
2264         if (no_iommu || swiotlb || dmar_disabled)
2265                 return -ENODEV;
2266
2267         if (dmar_table_init())
2268                 return  -ENODEV;
2269
2270         iommu_init_mempool();
2271         dmar_init_reserved_ranges();
2272
2273         init_no_remapping_devices();
2274
2275         ret = init_dmars();
2276         if (ret) {
2277                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2278                 put_iova_domain(&reserved_iova_list);
2279                 iommu_exit_mempool();
2280                 return ret;
2281         }
2282         printk(KERN_INFO
2283         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2284
2285         force_iommu = 1;
2286         dma_ops = &intel_dma_ops;
2287         return 0;
2288 }
2289