[linux-2.6] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
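/*
 * Worked example, assuming 4 KiB pages (PAGE_SHIFT == 12): DMA_32BIT_MASK is
 * 0xffffffff, so DMA_32BIT_PFN is 0xfffff and iova allocations made against
 * it stay below the 4 GiB boundary.
 */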
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 static int rwbf_quirk;
65
66 /*
67  * 0: Present
68  * 1-11: Reserved
69  * 12-63: Context Ptr (12 - (haw-1))
70  * 64-127: Reserved
71  */
72 struct root_entry {
73         u64     val;
74         u64     rsvd1;
75 };
76 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
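/*
 * With VTD_PAGE_SIZE of 4 KiB and the 16-byte root_entry above, ROOT_ENTRY_NR
 * works out to 256: one root entry per possible PCI bus number, which is how
 * device_to_context_entry() below indexes iommu->root_entry[].
 */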
77 static inline bool root_present(struct root_entry *root)
78 {
79         return (root->val & 1);
80 }
81 static inline void set_root_present(struct root_entry *root)
82 {
83         root->val |= 1;
84 }
85 static inline void set_root_value(struct root_entry *root, unsigned long value)
86 {
87         root->val |= value & VTD_PAGE_MASK;
88 }
89
90 static inline struct context_entry *
91 get_context_addr_from_root(struct root_entry *root)
92 {
93         return (struct context_entry *)
94                 (root_present(root)?phys_to_virt(
95                 root->val & VTD_PAGE_MASK) :
96                 NULL);
97 }
98
99 /*
100  * low 64 bits:
101  * 0: present
102  * 1: fault processing disable
103  * 2-3: translation type
104  * 12-63: address space root
105  * high 64 bits:
106  * 0-2: address width
107  * 3-6: available
108  * 8-23: domain id
109  */
110 struct context_entry {
111         u64 lo;
112         u64 hi;
113 };
114
115 static inline bool context_present(struct context_entry *context)
116 {
117         return (context->lo & 1);
118 }
119 static inline void context_set_present(struct context_entry *context)
120 {
121         context->lo |= 1;
122 }
123
124 static inline void context_set_fault_enable(struct context_entry *context)
125 {
126         context->lo &= (((u64)-1) << 2) | 1;
127 }
128
129 #define CONTEXT_TT_MULTI_LEVEL 0
130
131 static inline void context_set_translation_type(struct context_entry *context,
132                                                 unsigned long value)
133 {
134         context->lo &= (((u64)-1) << 4) | 3;
135         context->lo |= (value & 3) << 2;
136 }
137
138 static inline void context_set_address_root(struct context_entry *context,
139                                             unsigned long value)
140 {
141         context->lo |= value & VTD_PAGE_MASK;
142 }
143
144 static inline void context_set_address_width(struct context_entry *context,
145                                              unsigned long value)
146 {
147         context->hi |= value & 7;
148 }
149
150 static inline void context_set_domain_id(struct context_entry *context,
151                                          unsigned long value)
152 {
153         context->hi |= (value & ((1 << 16) - 1)) << 8;
154 }
155
156 static inline void context_clear_entry(struct context_entry *context)
157 {
158         context->lo = 0;
159         context->hi = 0;
160 }
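/*
 * The setters above are combined in domain_context_mapping_one() below: a
 * live entry gets a domain id, an address width, the page-table root and the
 * translation type, then fault processing is enabled and the present bit set.
 */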
161
162 /*
163  * 0: readable
164  * 1: writable
165  * 2-6: reserved
166  * 7: super page
167  * 8-11: available
168  * 12-63: Host physical address
169  */
170 struct dma_pte {
171         u64 val;
172 };
173
174 static inline void dma_clear_pte(struct dma_pte *pte)
175 {
176         pte->val = 0;
177 }
178
179 static inline void dma_set_pte_readable(struct dma_pte *pte)
180 {
181         pte->val |= DMA_PTE_READ;
182 }
183
184 static inline void dma_set_pte_writable(struct dma_pte *pte)
185 {
186         pte->val |= DMA_PTE_WRITE;
187 }
188
189 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
190 {
191         pte->val = (pte->val & ~3) | (prot & 3);
192 }
193
194 static inline u64 dma_pte_addr(struct dma_pte *pte)
195 {
196         return (pte->val & VTD_PAGE_MASK);
197 }
198
199 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
200 {
201         pte->val |= (addr & VTD_PAGE_MASK);
202 }
203
204 static inline bool dma_pte_present(struct dma_pte *pte)
205 {
206         return (pte->val & 3) != 0;
207 }
208
209 /* devices under the same p2p bridge are owned in one domain */
210 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
211
212 /* domain represents a virtual machine; more than one device
213  * across iommus may be owned by one domain, e.g. a kvm guest.
214  */
215 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
216
217 struct dmar_domain {
218         int     id;                     /* domain id */
219         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
220
221         struct list_head devices;       /* all devices' list */
222         struct iova_domain iovad;       /* iova's that belong to this domain */
223
224         struct dma_pte  *pgd;           /* virtual address */
225         spinlock_t      mapping_lock;   /* page table lock */
226         int             gaw;            /* max guest address width */
227
228         /* adjusted guest address width, 0 is level 2 30-bit */
229         int             agaw;
230
231         int             flags;          /* flags to find out type of domain */
232
233         int             iommu_coherency;/* indicate coherency of iommu access */
234         int             iommu_count;    /* reference count of iommu */
235         spinlock_t      iommu_lock;     /* protect iommu set in domain */
236         u64             max_addr;       /* maximum mapped address */
237 };
238
239 /* PCI domain-device relationship */
240 struct device_domain_info {
241         struct list_head link;  /* link to domain siblings */
242         struct list_head global; /* link to global list */
243         u8 bus;                 /* PCI bus number */
244         u8 devfn;               /* PCI devfn number */
245         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
246         struct dmar_domain *domain; /* pointer to domain */
247 };
248
249 static void flush_unmaps_timeout(unsigned long data);
250
251 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
252
253 #define HIGH_WATER_MARK 250
254 struct deferred_flush_tables {
255         int next;
256         struct iova *iova[HIGH_WATER_MARK];
257         struct dmar_domain *domain[HIGH_WATER_MARK];
258 };
259
260 static struct deferred_flush_tables *deferred_flush;
261
262 /* number of iommus; used to size the bitmaps that index intel_iommus */
263 static int g_num_of_iommus;
264
265 static DEFINE_SPINLOCK(async_umap_flush_lock);
266 static LIST_HEAD(unmaps_to_do);
267
268 static int timer_on;
269 static long list_size;
270
271 static void domain_remove_dev_info(struct dmar_domain *domain);
272
273 #ifdef CONFIG_DMAR_DEFAULT_ON
274 int dmar_disabled = 0;
275 #else
276 int dmar_disabled = 1;
277 #endif /*CONFIG_DMAR_DEFAULT_ON*/
278
279 static int __initdata dmar_map_gfx = 1;
280 static int dmar_forcedac;
281 static int intel_iommu_strict;
282
283 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
284 static DEFINE_SPINLOCK(device_domain_lock);
285 static LIST_HEAD(device_domain_list);
286
287 static struct iommu_ops intel_iommu_ops;
288
289 static int __init intel_iommu_setup(char *str)
290 {
291         if (!str)
292                 return -EINVAL;
293         while (*str) {
294                 if (!strncmp(str, "on", 2)) {
295                         dmar_disabled = 0;
296                         printk(KERN_INFO "Intel-IOMMU: enabled\n");
297                 } else if (!strncmp(str, "off", 3)) {
298                         dmar_disabled = 1;
299                         printk(KERN_INFO "Intel-IOMMU: disabled\n");
300                 } else if (!strncmp(str, "igfx_off", 8)) {
301                         dmar_map_gfx = 0;
302                         printk(KERN_INFO
303                                 "Intel-IOMMU: disable GFX device mapping\n");
304                 } else if (!strncmp(str, "forcedac", 8)) {
305                         printk(KERN_INFO
306                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
307                         dmar_forcedac = 1;
308                 } else if (!strncmp(str, "strict", 6)) {
309                         printk(KERN_INFO
310                                 "Intel-IOMMU: disable batched IOTLB flush\n");
311                         intel_iommu_strict = 1;
312                 }
313
314                 str += strcspn(str, ",");
315                 while (*str == ',')
316                         str++;
317         }
318         return 0;
319 }
320 __setup("intel_iommu=", intel_iommu_setup);
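/*
 * Example boot-time usage, as parsed by intel_iommu_setup() above:
 *
 *     intel_iommu=on,strict,igfx_off
 *
 * enables the IOMMU, disables batched IOTLB flushing and skips mapping of
 * graphics devices; options are comma separated.
 */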
321
322 static struct kmem_cache *iommu_domain_cache;
323 static struct kmem_cache *iommu_devinfo_cache;
324 static struct kmem_cache *iommu_iova_cache;
325
326 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
327 {
328         unsigned int flags;
329         void *vaddr;
330
331         /* trying to avoid low memory issues */
332         flags = current->flags & PF_MEMALLOC;
333         current->flags |= PF_MEMALLOC;
334         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
335         current->flags &= (~PF_MEMALLOC | flags);
336         return vaddr;
337 }
338
339
340 static inline void *alloc_pgtable_page(void)
341 {
342         unsigned int flags;
343         void *vaddr;
344
345         /* trying to avoid low memory issues */
346         flags = current->flags & PF_MEMALLOC;
347         current->flags |= PF_MEMALLOC;
348         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
349         current->flags &= (~PF_MEMALLOC | flags);
350         return vaddr;
351 }
352
353 static inline void free_pgtable_page(void *vaddr)
354 {
355         free_page((unsigned long)vaddr);
356 }
357
358 static inline void *alloc_domain_mem(void)
359 {
360         return iommu_kmem_cache_alloc(iommu_domain_cache);
361 }
362
363 static void free_domain_mem(void *vaddr)
364 {
365         kmem_cache_free(iommu_domain_cache, vaddr);
366 }
367
368 static inline void * alloc_devinfo_mem(void)
369 {
370         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
371 }
372
373 static inline void free_devinfo_mem(void *vaddr)
374 {
375         kmem_cache_free(iommu_devinfo_cache, vaddr);
376 }
377
378 struct iova *alloc_iova_mem(void)
379 {
380         return iommu_kmem_cache_alloc(iommu_iova_cache);
381 }
382
383 void free_iova_mem(struct iova *iova)
384 {
385         kmem_cache_free(iommu_iova_cache, iova);
386 }
387
388
389 static inline int width_to_agaw(int width);
390
391 /* calculate agaw for each iommu.
392  * "SAGAW" may be different across iommus, use a default agaw, and
393  * get a supported less agaw for iommus that don't support the default agaw.
394  */
395 int iommu_calculate_agaw(struct intel_iommu *iommu)
396 {
397         unsigned long sagaw;
398         int agaw = -1;
399
400         sagaw = cap_sagaw(iommu->cap);
401         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
402              agaw >= 0; agaw--) {
403                 if (test_bit(agaw, &sagaw))
404                         break;
405         }
406
407         return agaw;
408 }
409
410 /* in the native (non-VM) case, each domain is associated with only one iommu */
411 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
412 {
413         int iommu_id;
414
415         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
416
417         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
418         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
419                 return NULL;
420
421         return g_iommus[iommu_id];
422 }
423
424 /* "Coherency" capability may be different across iommus */
425 static void domain_update_iommu_coherency(struct dmar_domain *domain)
426 {
427         int i;
428
429         domain->iommu_coherency = 1;
430
431         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
432         for (; i < g_num_of_iommus; ) {
433                 if (!ecap_coherent(g_iommus[i]->ecap)) {
434                         domain->iommu_coherency = 0;
435                         break;
436                 }
437                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
438         }
439 }
440
441 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
442 {
443         struct dmar_drhd_unit *drhd = NULL;
444         int i;
445
446         for_each_drhd_unit(drhd) {
447                 if (drhd->ignored)
448                         continue;
449
450                 for (i = 0; i < drhd->devices_cnt; i++)
451                         if (drhd->devices[i] &&
452                             drhd->devices[i]->bus->number == bus &&
453                             drhd->devices[i]->devfn == devfn)
454                                 return drhd->iommu;
455
456                 if (drhd->include_all)
457                         return drhd->iommu;
458         }
459
460         return NULL;
461 }
462
463 static void domain_flush_cache(struct dmar_domain *domain,
464                                void *addr, int size)
465 {
466         if (!domain->iommu_coherency)
467                 clflush_cache_range(addr, size);
468 }
469
470 /* Gets context entry for a given bus and devfn */
471 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
472                 u8 bus, u8 devfn)
473 {
474         struct root_entry *root;
475         struct context_entry *context;
476         unsigned long phy_addr;
477         unsigned long flags;
478
479         spin_lock_irqsave(&iommu->lock, flags);
480         root = &iommu->root_entry[bus];
481         context = get_context_addr_from_root(root);
482         if (!context) {
483                 context = (struct context_entry *)alloc_pgtable_page();
484                 if (!context) {
485                         spin_unlock_irqrestore(&iommu->lock, flags);
486                         return NULL;
487                 }
488                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
489                 phy_addr = virt_to_phys((void *)context);
490                 set_root_value(root, phy_addr);
491                 set_root_present(root);
492                 __iommu_flush_cache(iommu, root, sizeof(*root));
493         }
494         spin_unlock_irqrestore(&iommu->lock, flags);
495         return &context[devfn];
496 }
497
498 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
499 {
500         struct root_entry *root;
501         struct context_entry *context;
502         int ret;
503         unsigned long flags;
504
505         spin_lock_irqsave(&iommu->lock, flags);
506         root = &iommu->root_entry[bus];
507         context = get_context_addr_from_root(root);
508         if (!context) {
509                 ret = 0;
510                 goto out;
511         }
512         ret = context_present(&context[devfn]);
513 out:
514         spin_unlock_irqrestore(&iommu->lock, flags);
515         return ret;
516 }
517
518 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
519 {
520         struct root_entry *root;
521         struct context_entry *context;
522         unsigned long flags;
523
524         spin_lock_irqsave(&iommu->lock, flags);
525         root = &iommu->root_entry[bus];
526         context = get_context_addr_from_root(root);
527         if (context) {
528                 context_clear_entry(&context[devfn]);
529                 __iommu_flush_cache(iommu, &context[devfn],
530                         sizeof(*context));
531         }
532         spin_unlock_irqrestore(&iommu->lock, flags);
533 }
534
535 static void free_context_table(struct intel_iommu *iommu)
536 {
537         struct root_entry *root;
538         int i;
539         unsigned long flags;
540         struct context_entry *context;
541
542         spin_lock_irqsave(&iommu->lock, flags);
543         if (!iommu->root_entry) {
544                 goto out;
545         }
546         for (i = 0; i < ROOT_ENTRY_NR; i++) {
547                 root = &iommu->root_entry[i];
548                 context = get_context_addr_from_root(root);
549                 if (context)
550                         free_pgtable_page(context);
551         }
552         free_pgtable_page(iommu->root_entry);
553         iommu->root_entry = NULL;
554 out:
555         spin_unlock_irqrestore(&iommu->lock, flags);
556 }
557
558 /* page table handling */
559 #define LEVEL_STRIDE            (9)
560 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
561
562 static inline int agaw_to_level(int agaw)
563 {
564         return agaw + 2;
565 }
566
567 static inline int agaw_to_width(int agaw)
568 {
569         return 30 + agaw * LEVEL_STRIDE;
570
571 }
572
573 static inline int width_to_agaw(int width)
574 {
575         return (width - 30) / LEVEL_STRIDE;
576 }
577
578 static inline unsigned int level_to_offset_bits(int level)
579 {
580         return (12 + (level - 1) * LEVEL_STRIDE);
581 }
582
583 static inline int address_level_offset(u64 addr, int level)
584 {
585         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
586 }
587
588 static inline u64 level_mask(int level)
589 {
590         return ((u64)-1 << level_to_offset_bits(level));
591 }
592
593 static inline u64 level_size(int level)
594 {
595         return ((u64)1 << level_to_offset_bits(level));
596 }
597
598 static inline u64 align_to_level(u64 addr, int level)
599 {
600         return ((addr + level_size(level) - 1) & level_mask(level));
601 }
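/*
 * Worked example for the default 48-bit domain width defined above:
 *
 *     width_to_agaw(48)      == (48 - 30) / 9 == 2
 *     agaw_to_level(2)       == 4                  (four-level page table)
 *     level_to_offset_bits() == 12, 21, 30, 39 for levels 1..4
 *
 * i.e. four 9-bit table levels sit above the 12-bit page offset, covering
 * exactly 48 bits of address.
 */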
602
603 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
604 {
605         int addr_width = agaw_to_width(domain->agaw);
606         struct dma_pte *parent, *pte = NULL;
607         int level = agaw_to_level(domain->agaw);
608         int offset;
609         unsigned long flags;
610
611         BUG_ON(!domain->pgd);
612
613         addr &= (((u64)1) << addr_width) - 1;
614         parent = domain->pgd;
615
616         spin_lock_irqsave(&domain->mapping_lock, flags);
617         while (level > 0) {
618                 void *tmp_page;
619
620                 offset = address_level_offset(addr, level);
621                 pte = &parent[offset];
622                 if (level == 1)
623                         break;
624
625                 if (!dma_pte_present(pte)) {
626                         tmp_page = alloc_pgtable_page();
627
628                         if (!tmp_page) {
629                                 spin_unlock_irqrestore(&domain->mapping_lock,
630                                         flags);
631                                 return NULL;
632                         }
633                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
634                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
635                         /*
636                          * high level table always sets r/w, last level page
637                          * table control read/write
638                          */
639                         dma_set_pte_readable(pte);
640                         dma_set_pte_writable(pte);
641                         domain_flush_cache(domain, pte, sizeof(*pte));
642                 }
643                 parent = phys_to_virt(dma_pte_addr(pte));
644                 level--;
645         }
646
647         spin_unlock_irqrestore(&domain->mapping_lock, flags);
648         return pte;
649 }
650
651 /* return address's pte at specific level */
652 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
653                 int level)
654 {
655         struct dma_pte *parent, *pte = NULL;
656         int total = agaw_to_level(domain->agaw);
657         int offset;
658
659         parent = domain->pgd;
660         while (level <= total) {
661                 offset = address_level_offset(addr, total);
662                 pte = &parent[offset];
663                 if (level == total)
664                         return pte;
665
666                 if (!dma_pte_present(pte))
667                         break;
668                 parent = phys_to_virt(dma_pte_addr(pte));
669                 total--;
670         }
671         return NULL;
672 }
673
674 /* clear one page's page table */
675 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
676 {
677         struct dma_pte *pte = NULL;
678
679         /* get last level pte */
680         pte = dma_addr_level_pte(domain, addr, 1);
681
682         if (pte) {
683                 dma_clear_pte(pte);
684                 domain_flush_cache(domain, pte, sizeof(*pte));
685         }
686 }
687
688 /* clear last level pte; a tlb flush should follow */
689 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
690 {
691         int addr_width = agaw_to_width(domain->agaw);
692
693         start &= (((u64)1) << addr_width) - 1;
694         end &= (((u64)1) << addr_width) - 1;
695         /* in case it's partial page */
696         start = PAGE_ALIGN(start);
697         end &= PAGE_MASK;
698
699         /* we don't need lock here, nobody else touches the iova range */
700         while (start < end) {
701                 dma_pte_clear_one(domain, start);
702                 start += VTD_PAGE_SIZE;
703         }
704 }
705
706 /* free page table pages. last level pte should already be cleared */
707 static void dma_pte_free_pagetable(struct dmar_domain *domain,
708         u64 start, u64 end)
709 {
710         int addr_width = agaw_to_width(domain->agaw);
711         struct dma_pte *pte;
712         int total = agaw_to_level(domain->agaw);
713         int level;
714         u64 tmp;
715
716         start &= (((u64)1) << addr_width) - 1;
717         end &= (((u64)1) << addr_width) - 1;
718
719         /* we don't need lock here, nobody else touches the iova range */
720         level = 2;
721         while (level <= total) {
722                 tmp = align_to_level(start, level);
723                 if (tmp >= end || (tmp + level_size(level) > end))
724                         return;
725
726                 while (tmp < end) {
727                         pte = dma_addr_level_pte(domain, tmp, level);
728                         if (pte) {
729                                 free_pgtable_page(
730                                         phys_to_virt(dma_pte_addr(pte)));
731                                 dma_clear_pte(pte);
732                                 domain_flush_cache(domain, pte, sizeof(*pte));
733                         }
734                         tmp += level_size(level);
735                 }
736                 level++;
737         }
738         /* free pgd */
739         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
740                 free_pgtable_page(domain->pgd);
741                 domain->pgd = NULL;
742         }
743 }
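/*
 * Worked example, with 4 KiB pages: level_size(2) is 1 << 21 (2 MiB) and
 * level_size(3) is 1 << 30 (1 GiB), so an intermediate table is only freed
 * once a fully aligned 2 MiB (or 1 GiB, ...) chunk lies inside [start, end).
 */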
744
745 /* iommu handling */
746 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
747 {
748         struct root_entry *root;
749         unsigned long flags;
750
751         root = (struct root_entry *)alloc_pgtable_page();
752         if (!root)
753                 return -ENOMEM;
754
755         __iommu_flush_cache(iommu, root, ROOT_SIZE);
756
757         spin_lock_irqsave(&iommu->lock, flags);
758         iommu->root_entry = root;
759         spin_unlock_irqrestore(&iommu->lock, flags);
760
761         return 0;
762 }
763
764 static void iommu_set_root_entry(struct intel_iommu *iommu)
765 {
766         void *addr;
767         u32 cmd, sts;
768         unsigned long flag;
769
770         addr = iommu->root_entry;
771
772         spin_lock_irqsave(&iommu->register_lock, flag);
773         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
774
775         cmd = iommu->gcmd | DMA_GCMD_SRTP;
776         writel(cmd, iommu->reg + DMAR_GCMD_REG);
777
778         /* Make sure hardware completes it */
779         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
780                 readl, (sts & DMA_GSTS_RTPS), sts);
781
782         spin_unlock_irqrestore(&iommu->register_lock, flag);
783 }
784
785 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
786 {
787         u32 val;
788         unsigned long flag;
789
790         if (!rwbf_quirk && !cap_rwbf(iommu->cap))
791                 return;
792         val = iommu->gcmd | DMA_GCMD_WBF;
793
794         spin_lock_irqsave(&iommu->register_lock, flag);
795         writel(val, iommu->reg + DMAR_GCMD_REG);
796
797         /* Make sure hardware completes it */
798         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
799                         readl, (!(val & DMA_GSTS_WBFS)), val);
800
801         spin_unlock_irqrestore(&iommu->register_lock, flag);
802 }
803
804 /* return value determines whether we need a write buffer flush */
805 static int __iommu_flush_context(struct intel_iommu *iommu,
806         u16 did, u16 source_id, u8 function_mask, u64 type,
807         int non_present_entry_flush)
808 {
809         u64 val = 0;
810         unsigned long flag;
811
812         /*
813          * In the non-present entry flush case: if the hardware doesn't cache
814          * non-present entries we do nothing, and if it does cache them we
815          * flush the entries of domain 0 (the domain id used to tag any cached
816          * non-present entries)
817          */
818         if (non_present_entry_flush) {
819                 if (!cap_caching_mode(iommu->cap))
820                         return 1;
821                 else
822                         did = 0;
823         }
824
825         switch (type) {
826         case DMA_CCMD_GLOBAL_INVL:
827                 val = DMA_CCMD_GLOBAL_INVL;
828                 break;
829         case DMA_CCMD_DOMAIN_INVL:
830                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
831                 break;
832         case DMA_CCMD_DEVICE_INVL:
833                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
834                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
835                 break;
836         default:
837                 BUG();
838         }
839         val |= DMA_CCMD_ICC;
840
841         spin_lock_irqsave(&iommu->register_lock, flag);
842         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
843
844         /* Make sure hardware completes it */
845         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
846                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
847
848         spin_unlock_irqrestore(&iommu->register_lock, flag);
849
850         /* flush context entry will implicitly flush write buffer */
851         return 0;
852 }
853
854 /* return value determines whether we need a write buffer flush */
855 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
856         u64 addr, unsigned int size_order, u64 type,
857         int non_present_entry_flush)
858 {
859         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
860         u64 val = 0, val_iva = 0;
861         unsigned long flag;
862
863         /*
864          * In the non-present entry flush case: if the hardware doesn't cache
865          * non-present entries we do nothing, and if it does cache them we
866          * flush the entries of domain 0 (the domain id used to tag any cached
867          * non-present entries)
868          */
869         if (non_present_entry_flush) {
870                 if (!cap_caching_mode(iommu->cap))
871                         return 1;
872                 else
873                         did = 0;
874         }
875
876         switch (type) {
877         case DMA_TLB_GLOBAL_FLUSH:
878                 /* global flush doesn't need set IVA_REG */
879                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
880                 break;
881         case DMA_TLB_DSI_FLUSH:
882                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
883                 break;
884         case DMA_TLB_PSI_FLUSH:
885                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
886                 /* Note: always flush non-leaf currently */
887                 val_iva = size_order | addr;
888                 break;
889         default:
890                 BUG();
891         }
892         /* Note: set drain read/write */
893 #if 0
894         /*
895          * This is probably only needed to be extra safe.  It looks like we
896          * can ignore it without any impact.
897          */
898         if (cap_read_drain(iommu->cap))
899                 val |= DMA_TLB_READ_DRAIN;
900 #endif
901         if (cap_write_drain(iommu->cap))
902                 val |= DMA_TLB_WRITE_DRAIN;
903
904         spin_lock_irqsave(&iommu->register_lock, flag);
905         /* Note: Only uses first TLB reg currently */
906         if (val_iva)
907                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
908         dmar_writeq(iommu->reg + tlb_offset + 8, val);
909
910         /* Make sure hardware completes it */
911         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
912                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
913
914         spin_unlock_irqrestore(&iommu->register_lock, flag);
915
916         /* check IOTLB invalidation granularity */
917         if (DMA_TLB_IAIG(val) == 0)
918                 printk(KERN_ERR "IOMMU: flush IOTLB failed\n");
919         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
920                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
921                         (unsigned long long)DMA_TLB_IIRG(type),
922                         (unsigned long long)DMA_TLB_IAIG(val));
923         /* flush iotlb entry will implicitly flush write buffer */
924         return 0;
925 }
926
927 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
928         u64 addr, unsigned int pages, int non_present_entry_flush)
929 {
930         unsigned int mask;
931
932         BUG_ON(addr & (~VTD_PAGE_MASK));
933         BUG_ON(pages == 0);
934
935         /* Fall back to domain selective flush if there is no PSI support */
936         if (!cap_pgsel_inv(iommu->cap))
937                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
938                                                 DMA_TLB_DSI_FLUSH,
939                                                 non_present_entry_flush);
940
941         /*
942          * PSI requires the invalidation size to be 2 ^ x pages, with the base
943          * address naturally aligned to that size
944          */
945         mask = ilog2(__roundup_pow_of_two(pages));
946         /* Fall back to domain selective flush if the size is too big */
947         if (mask > cap_max_amask_val(iommu->cap))
948                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
949                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
950
951         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
952                                         DMA_TLB_PSI_FLUSH,
953                                         non_present_entry_flush);
954 }
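/*
 * Worked example: for a 5-page request, __roundup_pow_of_two(5) == 8 and
 * ilog2(8) == 3, so the PSI flush asks the hardware to invalidate 2^3 = 8
 * pages starting at addr (which the BUG_ON above requires to be page aligned).
 */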
955
956 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
957 {
958         u32 pmen;
959         unsigned long flags;
960
961         spin_lock_irqsave(&iommu->register_lock, flags);
962         pmen = readl(iommu->reg + DMAR_PMEN_REG);
963         pmen &= ~DMA_PMEN_EPM;
964         writel(pmen, iommu->reg + DMAR_PMEN_REG);
965
966         /* wait for the protected region status bit to clear */
967         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
968                 readl, !(pmen & DMA_PMEN_PRS), pmen);
969
970         spin_unlock_irqrestore(&iommu->register_lock, flags);
971 }
972
973 static int iommu_enable_translation(struct intel_iommu *iommu)
974 {
975         u32 sts;
976         unsigned long flags;
977
978         spin_lock_irqsave(&iommu->register_lock, flags);
979         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
980
981         /* Make sure hardware completes it */
982         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
983                 readl, (sts & DMA_GSTS_TES), sts);
984
985         iommu->gcmd |= DMA_GCMD_TE;
986         spin_unlock_irqrestore(&iommu->register_lock, flags);
987         return 0;
988 }
989
990 static int iommu_disable_translation(struct intel_iommu *iommu)
991 {
992         u32 sts;
993         unsigned long flag;
994
995         spin_lock_irqsave(&iommu->register_lock, flag);
996         iommu->gcmd &= ~DMA_GCMD_TE;
997         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
998
999         /* Make sure hardware completes it */
1000         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
1001                 readl, (!(sts & DMA_GSTS_TES)), sts);
1002
1003         spin_unlock_irqrestore(&iommu->register_lock, flag);
1004         return 0;
1005 }
1006
1007 /* iommu interrupt handling. Most of it is MSI-like. */
1008
1009 static const char *fault_reason_strings[] =
1010 {
1011         "Software",
1012         "Present bit in root entry is clear",
1013         "Present bit in context entry is clear",
1014         "Invalid context entry",
1015         "Access beyond MGAW",
1016         "PTE Write access is not set",
1017         "PTE Read access is not set",
1018         "Next page table ptr is invalid",
1019         "Root table address invalid",
1020         "Context table ptr is invalid",
1021         "non-zero reserved fields in RTP",
1022         "non-zero reserved fields in CTP",
1023         "non-zero reserved fields in PTE",
1024 };
1025 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1026
1027 const char *dmar_get_fault_reason(u8 fault_reason)
1028 {
1029         if (fault_reason > MAX_FAULT_REASON_IDX)
1030                 return "Unknown";
1031         else
1032                 return fault_reason_strings[fault_reason];
1033 }
1034
1035 void dmar_msi_unmask(unsigned int irq)
1036 {
1037         struct intel_iommu *iommu = get_irq_data(irq);
1038         unsigned long flag;
1039
1040         /* unmask it */
1041         spin_lock_irqsave(&iommu->register_lock, flag);
1042         writel(0, iommu->reg + DMAR_FECTL_REG);
1043         /* Read a reg to force flush the posted write */
1044         readl(iommu->reg + DMAR_FECTL_REG);
1045         spin_unlock_irqrestore(&iommu->register_lock, flag);
1046 }
1047
1048 void dmar_msi_mask(unsigned int irq)
1049 {
1050         unsigned long flag;
1051         struct intel_iommu *iommu = get_irq_data(irq);
1052
1053         /* mask it */
1054         spin_lock_irqsave(&iommu->register_lock, flag);
1055         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1056         /* Read a reg to force flush the posted write */
1057         readl(iommu->reg + DMAR_FECTL_REG);
1058         spin_unlock_irqrestore(&iommu->register_lock, flag);
1059 }
1060
1061 void dmar_msi_write(int irq, struct msi_msg *msg)
1062 {
1063         struct intel_iommu *iommu = get_irq_data(irq);
1064         unsigned long flag;
1065
1066         spin_lock_irqsave(&iommu->register_lock, flag);
1067         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1068         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1069         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1070         spin_unlock_irqrestore(&iommu->register_lock, flag);
1071 }
1072
1073 void dmar_msi_read(int irq, struct msi_msg *msg)
1074 {
1075         struct intel_iommu *iommu = get_irq_data(irq);
1076         unsigned long flag;
1077
1078         spin_lock_irqsave(&iommu->register_lock, flag);
1079         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1080         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1081         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1082         spin_unlock_irqrestore(&iommu->register_lock, flag);
1083 }
1084
1085 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1086                 u8 fault_reason, u16 source_id, unsigned long long addr)
1087 {
1088         const char *reason;
1089
1090         reason = dmar_get_fault_reason(fault_reason);
1091
1092         printk(KERN_ERR
1093                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1094                 "fault addr %llx \n"
1095                 "DMAR:[fault reason %02d] %s\n",
1096                 (type ? "DMA Read" : "DMA Write"),
1097                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1098                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1099         return 0;
1100 }
1101
1102 #define PRIMARY_FAULT_REG_LEN (16)
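/*
 * Each primary fault record is 16 bytes: the qword at offset 0 carries the
 * faulting page address, the dword at offset 8 the source id, and the dword
 * at offset 12 the fault reason, the request type and the F bit that the
 * handler below tests and then writes back to clear the record.
 */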
1103 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1104 {
1105         struct intel_iommu *iommu = dev_id;
1106         int reg, fault_index;
1107         u32 fault_status;
1108         unsigned long flag;
1109
1110         spin_lock_irqsave(&iommu->register_lock, flag);
1111         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1112
1113         /* TBD: ignore advanced fault log currently */
1114         if (!(fault_status & DMA_FSTS_PPF))
1115                 goto clear_overflow;
1116
1117         fault_index = dma_fsts_fault_record_index(fault_status);
1118         reg = cap_fault_reg_offset(iommu->cap);
1119         while (1) {
1120                 u8 fault_reason;
1121                 u16 source_id;
1122                 u64 guest_addr;
1123                 int type;
1124                 u32 data;
1125
1126                 /* highest 32 bits */
1127                 data = readl(iommu->reg + reg +
1128                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1129                 if (!(data & DMA_FRCD_F))
1130                         break;
1131
1132                 fault_reason = dma_frcd_fault_reason(data);
1133                 type = dma_frcd_type(data);
1134
1135                 data = readl(iommu->reg + reg +
1136                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1137                 source_id = dma_frcd_source_id(data);
1138
1139                 guest_addr = dmar_readq(iommu->reg + reg +
1140                                 fault_index * PRIMARY_FAULT_REG_LEN);
1141                 guest_addr = dma_frcd_page_addr(guest_addr);
1142                 /* clear the fault */
1143                 writel(DMA_FRCD_F, iommu->reg + reg +
1144                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1145
1146                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1147
1148                 iommu_page_fault_do_one(iommu, type, fault_reason,
1149                                 source_id, guest_addr);
1150
1151                 fault_index++;
1152                 if (fault_index > cap_num_fault_regs(iommu->cap))
1153                         fault_index = 0;
1154                 spin_lock_irqsave(&iommu->register_lock, flag);
1155         }
1156 clear_overflow:
1157         /* clear primary fault overflow */
1158         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1159         if (fault_status & DMA_FSTS_PFO)
1160                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1161
1162         spin_unlock_irqrestore(&iommu->register_lock, flag);
1163         return IRQ_HANDLED;
1164 }
1165
1166 int dmar_set_interrupt(struct intel_iommu *iommu)
1167 {
1168         int irq, ret;
1169
1170         irq = create_irq();
1171         if (!irq) {
1172                 printk(KERN_ERR "IOMMU: no free vectors\n");
1173                 return -EINVAL;
1174         }
1175
1176         set_irq_data(irq, iommu);
1177         iommu->irq = irq;
1178
1179         ret = arch_setup_dmar_msi(irq);
1180         if (ret) {
1181                 set_irq_data(irq, NULL);
1182                 iommu->irq = 0;
1183                 destroy_irq(irq);
1184                 return ret;
1185         }
1186
1187         /* Clear any pending faults before requesting the irq */
1188         iommu_page_fault(irq, iommu);
1189
1190         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1191         if (ret)
1192                 printk(KERN_ERR "IOMMU: can't request irq\n");
1193         return ret;
1194 }
1195
1196 static int iommu_init_domains(struct intel_iommu *iommu)
1197 {
1198         unsigned long ndomains;
1199         unsigned long nlongs;
1200
1201         ndomains = cap_ndoms(iommu->cap);
1202         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1203         nlongs = BITS_TO_LONGS(ndomains);
1204
1205         /* TBD: there might be 64K domains,
1206          * consider a different allocation scheme for future chips
1207          */
1208         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1209         if (!iommu->domain_ids) {
1210                 printk(KERN_ERR "Allocating domain id array failed\n");
1211                 return -ENOMEM;
1212         }
1213         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1214                         GFP_KERNEL);
1215         if (!iommu->domains) {
1216                 printk(KERN_ERR "Allocating domain array failed\n");
1217                 kfree(iommu->domain_ids);
1218                 return -ENOMEM;
1219         }
1220
1221         spin_lock_init(&iommu->lock);
1222
1223         /*
1224          * if Caching mode is set, then invalid translations are tagged
1225          * with domain id 0. Hence we need to pre-allocate it.
1226          */
1227         if (cap_caching_mode(iommu->cap))
1228                 set_bit(0, iommu->domain_ids);
1229         return 0;
1230 }
1231
1232
1233 static void domain_exit(struct dmar_domain *domain);
1234 static void vm_domain_exit(struct dmar_domain *domain);
1235
1236 void free_dmar_iommu(struct intel_iommu *iommu)
1237 {
1238         struct dmar_domain *domain;
1239         int i;
1240         unsigned long flags;
1241
1242         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1243         for (; i < cap_ndoms(iommu->cap); ) {
1244                 domain = iommu->domains[i];
1245                 clear_bit(i, iommu->domain_ids);
1246
1247                 spin_lock_irqsave(&domain->iommu_lock, flags);
1248                 if (--domain->iommu_count == 0) {
1249                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1250                                 vm_domain_exit(domain);
1251                         else
1252                                 domain_exit(domain);
1253                 }
1254                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1255
1256                 i = find_next_bit(iommu->domain_ids,
1257                         cap_ndoms(iommu->cap), i+1);
1258         }
1259
1260         if (iommu->gcmd & DMA_GCMD_TE)
1261                 iommu_disable_translation(iommu);
1262
1263         if (iommu->irq) {
1264                 set_irq_data(iommu->irq, NULL);
1265                 /* This will mask the irq */
1266                 free_irq(iommu->irq, iommu);
1267                 destroy_irq(iommu->irq);
1268         }
1269
1270         kfree(iommu->domains);
1271         kfree(iommu->domain_ids);
1272
1273         g_iommus[iommu->seq_id] = NULL;
1274
1275         /* if all iommus are freed, free g_iommus */
1276         for (i = 0; i < g_num_of_iommus; i++) {
1277                 if (g_iommus[i])
1278                         break;
1279         }
1280
1281         if (i == g_num_of_iommus)
1282                 kfree(g_iommus);
1283
1284         /* free context mapping */
1285         free_context_table(iommu);
1286 }
1287
1288 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1289 {
1290         unsigned long num;
1291         unsigned long ndomains;
1292         struct dmar_domain *domain;
1293         unsigned long flags;
1294
1295         domain = alloc_domain_mem();
1296         if (!domain)
1297                 return NULL;
1298
1299         ndomains = cap_ndoms(iommu->cap);
1300
1301         spin_lock_irqsave(&iommu->lock, flags);
1302         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1303         if (num >= ndomains) {
1304                 spin_unlock_irqrestore(&iommu->lock, flags);
1305                 free_domain_mem(domain);
1306                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1307                 return NULL;
1308         }
1309
1310         set_bit(num, iommu->domain_ids);
1311         domain->id = num;
1312         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1313         set_bit(iommu->seq_id, &domain->iommu_bmp);
1314         domain->flags = 0;
1315         iommu->domains[num] = domain;
1316         spin_unlock_irqrestore(&iommu->lock, flags);
1317
1318         return domain;
1319 }
1320
1321 static void iommu_free_domain(struct dmar_domain *domain)
1322 {
1323         unsigned long flags;
1324         struct intel_iommu *iommu;
1325
1326         iommu = domain_get_iommu(domain);
1327
1328         spin_lock_irqsave(&iommu->lock, flags);
1329         clear_bit(domain->id, iommu->domain_ids);
1330         spin_unlock_irqrestore(&iommu->lock, flags);
1331 }
1332
1333 static struct iova_domain reserved_iova_list;
1334 static struct lock_class_key reserved_alloc_key;
1335 static struct lock_class_key reserved_rbtree_key;
1336
1337 static void dmar_init_reserved_ranges(void)
1338 {
1339         struct pci_dev *pdev = NULL;
1340         struct iova *iova;
1341         int i;
1342         u64 addr, size;
1343
1344         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1345
1346         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1347                 &reserved_alloc_key);
1348         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1349                 &reserved_rbtree_key);
1350
1351         /* IOAPIC ranges shouldn't be accessed by DMA */
1352         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1353                 IOVA_PFN(IOAPIC_RANGE_END));
1354         if (!iova)
1355                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1356
1357         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1358         for_each_pci_dev(pdev) {
1359                 struct resource *r;
1360
1361                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1362                         r = &pdev->resource[i];
1363                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1364                                 continue;
1365                         addr = r->start;
1366                         addr &= PAGE_MASK;
1367                         size = r->end - addr;
1368                         size = PAGE_ALIGN(size);
1369                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1370                                 IOVA_PFN(size + addr) - 1);
1371                         if (!iova)
1372                                 printk(KERN_ERR "Reserve iova failed\n");
1373                 }
1374         }
1375
1376 }
1377
1378 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1379 {
1380         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1381 }
1382
1383 static inline int guestwidth_to_adjustwidth(int gaw)
1384 {
1385         int agaw;
1386         int r = (gaw - 12) % 9;
1387
1388         if (r == 0)
1389                 agaw = gaw;
1390         else
1391                 agaw = gaw + 9 - r;
1392         if (agaw > 64)
1393                 agaw = 64;
1394         return agaw;
1395 }
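/*
 * Worked example: guestwidth_to_adjustwidth(36) gives r = (36 - 12) % 9 = 6,
 * so the width is rounded up to 36 + 9 - 6 = 39 bits; 39- and 48-bit guest
 * widths are already multiples of 9 above the 12-bit page offset and are
 * returned unchanged.
 */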
1396
1397 static int domain_init(struct dmar_domain *domain, int guest_width)
1398 {
1399         struct intel_iommu *iommu;
1400         int adjust_width, agaw;
1401         unsigned long sagaw;
1402
1403         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1404         spin_lock_init(&domain->mapping_lock);
1405         spin_lock_init(&domain->iommu_lock);
1406
1407         domain_reserve_special_ranges(domain);
1408
1409         /* calculate AGAW */
1410         iommu = domain_get_iommu(domain);
1411         if (guest_width > cap_mgaw(iommu->cap))
1412                 guest_width = cap_mgaw(iommu->cap);
1413         domain->gaw = guest_width;
1414         adjust_width = guestwidth_to_adjustwidth(guest_width);
1415         agaw = width_to_agaw(adjust_width);
1416         sagaw = cap_sagaw(iommu->cap);
1417         if (!test_bit(agaw, &sagaw)) {
1418                 /* hardware doesn't support it, choose a bigger one */
1419                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1420                 agaw = find_next_bit(&sagaw, 5, agaw);
1421                 if (agaw >= 5)
1422                         return -ENODEV;
1423         }
1424         domain->agaw = agaw;
1425         INIT_LIST_HEAD(&domain->devices);
1426
1427         if (ecap_coherent(iommu->ecap))
1428                 domain->iommu_coherency = 1;
1429         else
1430                 domain->iommu_coherency = 0;
1431
1432         domain->iommu_count = 1;
1433
1434         /* always allocate the top pgd */
1435         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1436         if (!domain->pgd)
1437                 return -ENOMEM;
1438         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1439         return 0;
1440 }
1441
1442 static void domain_exit(struct dmar_domain *domain)
1443 {
1444         u64 end;
1445
1446         /* Domain 0 is reserved, so don't process it */
1447         if (!domain)
1448                 return;
1449
1450         domain_remove_dev_info(domain);
1451         /* destroy iovas */
1452         put_iova_domain(&domain->iovad);
1453         end = DOMAIN_MAX_ADDR(domain->gaw);
1454         end = end & (~PAGE_MASK);
1455
1456         /* clear ptes */
1457         dma_pte_clear_range(domain, 0, end);
1458
1459         /* free page tables */
1460         dma_pte_free_pagetable(domain, 0, end);
1461
1462         iommu_free_domain(domain);
1463         free_domain_mem(domain);
1464 }
1465
1466 static int domain_context_mapping_one(struct dmar_domain *domain,
1467                 u8 bus, u8 devfn)
1468 {
1469         struct context_entry *context;
1470         unsigned long flags;
1471         struct intel_iommu *iommu;
1472         struct dma_pte *pgd;
1473         unsigned long num;
1474         unsigned long ndomains;
1475         int id;
1476         int agaw;
1477
1478         pr_debug("Set context mapping for %02x:%02x.%d\n",
1479                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1480         BUG_ON(!domain->pgd);
1481
1482         iommu = device_to_iommu(bus, devfn);
1483         if (!iommu)
1484                 return -ENODEV;
1485
1486         context = device_to_context_entry(iommu, bus, devfn);
1487         if (!context)
1488                 return -ENOMEM;
1489         spin_lock_irqsave(&iommu->lock, flags);
1490         if (context_present(context)) {
1491                 spin_unlock_irqrestore(&iommu->lock, flags);
1492                 return 0;
1493         }
1494
1495         id = domain->id;
1496         pgd = domain->pgd;
1497
1498         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1499                 int found = 0;
1500
1501                 /* find an available domain id for this device in iommu */
1502                 ndomains = cap_ndoms(iommu->cap);
1503                 num = find_first_bit(iommu->domain_ids, ndomains);
1504                 for (; num < ndomains; ) {
1505                         if (iommu->domains[num] == domain) {
1506                                 id = num;
1507                                 found = 1;
1508                                 break;
1509                         }
1510                         num = find_next_bit(iommu->domain_ids,
1511                                             cap_ndoms(iommu->cap), num+1);
1512                 }
1513
1514                 if (found == 0) {
1515                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1516                         if (num >= ndomains) {
1517                                 spin_unlock_irqrestore(&iommu->lock, flags);
1518                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1519                                 return -EFAULT;
1520                         }
1521
1522                         set_bit(num, iommu->domain_ids);
1523                         iommu->domains[num] = domain;
1524                         id = num;
1525                 }
1526
1527                 /* Skip top levels of page tables for
1528                  * an iommu which has a smaller agaw than the default.
1529                  */
1530                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1531                         pgd = phys_to_virt(dma_pte_addr(pgd));
1532                         if (!dma_pte_present(pgd)) {
1533                                 spin_unlock_irqrestore(&iommu->lock, flags);
1534                                 return -ENOMEM;
1535                         }
1536                 }
1537         }
1538
1539         context_set_domain_id(context, id);
1540         context_set_address_width(context, iommu->agaw);
1541         context_set_address_root(context, virt_to_phys(pgd));
1542         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1543         context_set_fault_enable(context);
1544         context_set_present(context);
1545         domain_flush_cache(domain, context, sizeof(*context));
1546
1547         /* it's a non-present to present mapping */
1548         if (iommu->flush.flush_context(iommu, domain->id,
1549                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1550                 DMA_CCMD_DEVICE_INVL, 1))
1551                 iommu_flush_write_buffer(iommu);
1552         else
1553                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1554
1555         spin_unlock_irqrestore(&iommu->lock, flags);
1556
1557         spin_lock_irqsave(&domain->iommu_lock, flags);
1558         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1559                 domain->iommu_count++;
1560                 domain_update_iommu_coherency(domain);
1561         }
1562         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1563         return 0;
1564 }
1565
1566 static int
1567 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1568 {
1569         int ret;
1570         struct pci_dev *tmp, *parent;
1571
1572         ret = domain_context_mapping_one(domain, pdev->bus->number,
1573                 pdev->devfn);
1574         if (ret)
1575                 return ret;
1576
1577         /* dependent device mapping */
1578         tmp = pci_find_upstream_pcie_bridge(pdev);
1579         if (!tmp)
1580                 return 0;
1581         /* Secondary interface's bus number and devfn 0 */
1582         parent = pdev->bus->self;
1583         while (parent != tmp) {
1584                 ret = domain_context_mapping_one(domain, parent->bus->number,
1585                         parent->devfn);
1586                 if (ret)
1587                         return ret;
1588                 parent = parent->bus->self;
1589         }
1590         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1591                 return domain_context_mapping_one(domain,
1592                         tmp->subordinate->number, 0);
1593         else /* this is a legacy PCI bridge */
1594                 return domain_context_mapping_one(domain,
1595                         tmp->bus->number, tmp->devfn);
1596 }
1597
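/*
 * Check whether pdev (and, when it sits behind a PCIe-to-PCI bridge,
 * every bridge on its upstream path) already has a context entry
 * programmed in the covering IOMMU.
 */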
1598 static int domain_context_mapped(struct pci_dev *pdev)
1599 {
1600         int ret;
1601         struct pci_dev *tmp, *parent;
1602         struct intel_iommu *iommu;
1603
1604         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1605         if (!iommu)
1606                 return -ENODEV;
1607
1608         ret = device_context_mapped(iommu,
1609                 pdev->bus->number, pdev->devfn);
1610         if (!ret)
1611                 return ret;
1612         /* dependent device mapping */
1613         tmp = pci_find_upstream_pcie_bridge(pdev);
1614         if (!tmp)
1615                 return ret;
1616         /* Secondary interface's bus number and devfn 0 */
1617         parent = pdev->bus->self;
1618         while (parent != tmp) {
1619                 ret = device_context_mapped(iommu, parent->bus->number,
1620                         parent->devfn);
1621                 if (!ret)
1622                         return ret;
1623                 parent = parent->bus->self;
1624         }
1625         if (tmp->is_pcie)
1626                 return device_context_mapped(iommu,
1627                         tmp->subordinate->number, 0);
1628         else
1629                 return device_context_mapped(iommu,
1630                         tmp->bus->number, tmp->devfn);
1631 }
1632
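/*
 * Map the IOVA range [iova, iova + size) to the host physical range
 * starting at hpa, one VT-d page at a time.  The caller must ensure the
 * range is not already mapped; a present pte trips the BUG_ON below.
 */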
1633 static int
1634 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1635                         u64 hpa, size_t size, int prot)
1636 {
1637         u64 start_pfn, end_pfn;
1638         struct dma_pte *pte;
1639         int index;
1640         int addr_width = agaw_to_width(domain->agaw);
1641
1642         hpa &= (((u64)1) << addr_width) - 1;
1643
1644         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1645                 return -EINVAL;
1646         iova &= PAGE_MASK;
1647         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1648         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1649         index = 0;
1650         while (start_pfn < end_pfn) {
1651                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1652                 if (!pte)
1653                         return -ENOMEM;
1654                 /* We don't need a lock here; nobody else
1655                  * touches this iova range
1656                  */
1657                 BUG_ON(dma_pte_addr(pte));
1658                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1659                 dma_set_pte_prot(pte, prot);
1660                 domain_flush_cache(domain, pte, sizeof(*pte));
1661                 start_pfn++;
1662                 index++;
1663         }
1664         return 0;
1665 }
1666
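/*
 * Clear the context entry for (bus, devfn) and invalidate the context
 * cache and IOTLB globally so the hardware stops using the old entry.
 */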
1667 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1668 {
1669         if (!iommu)
1670                 return;
1671
1672         clear_context_table(iommu, bus, devfn);
1673         iommu->flush.flush_context(iommu, 0, 0, 0,
1674                                            DMA_CCMD_GLOBAL_INVL, 0);
1675         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1676                                          DMA_TLB_GLOBAL_FLUSH, 0);
1677 }
1678
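/*
 * Unlink every device from the domain, clear its archdata pointer,
 * detach it from its IOMMU and free the device_domain_info.  The
 * device_domain_lock is dropped around the detach and re-taken for the
 * next iteration.
 */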
1679 static void domain_remove_dev_info(struct dmar_domain *domain)
1680 {
1681         struct device_domain_info *info;
1682         unsigned long flags;
1683         struct intel_iommu *iommu;
1684
1685         spin_lock_irqsave(&device_domain_lock, flags);
1686         while (!list_empty(&domain->devices)) {
1687                 info = list_entry(domain->devices.next,
1688                         struct device_domain_info, link);
1689                 list_del(&info->link);
1690                 list_del(&info->global);
1691                 if (info->dev)
1692                         info->dev->dev.archdata.iommu = NULL;
1693                 spin_unlock_irqrestore(&device_domain_lock, flags);
1694
1695                 iommu = device_to_iommu(info->bus, info->devfn);
1696                 iommu_detach_dev(iommu, info->bus, info->devfn);
1697                 free_devinfo_mem(info);
1698
1699                 spin_lock_irqsave(&device_domain_lock, flags);
1700         }
1701         spin_unlock_irqrestore(&device_domain_lock, flags);
1702 }
1703
1704 /*
1705  * find_domain
1706  * Note: struct pci_dev->dev.archdata.iommu stores the per-device domain info
1707  */
1708 static struct dmar_domain *
1709 find_domain(struct pci_dev *pdev)
1710 {
1711         struct device_domain_info *info;
1712
1713         /* No lock here, assumes no domain exit in normal case */
1714         info = pdev->dev.archdata.iommu;
1715         if (info)
1716                 return info->domain;
1717         return NULL;
1718 }
1719
1720 /* Find the domain for pdev; allocate and initialize one if none exists yet */
1721 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1722 {
1723         struct dmar_domain *domain, *found = NULL;
1724         struct intel_iommu *iommu;
1725         struct dmar_drhd_unit *drhd;
1726         struct device_domain_info *info, *tmp;
1727         struct pci_dev *dev_tmp;
1728         unsigned long flags;
1729         int bus = 0, devfn = 0;
1730
1731         domain = find_domain(pdev);
1732         if (domain)
1733                 return domain;
1734
1735         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1736         if (dev_tmp) {
1737                 if (dev_tmp->is_pcie) {
1738                         bus = dev_tmp->subordinate->number;
1739                         devfn = 0;
1740                 } else {
1741                         bus = dev_tmp->bus->number;
1742                         devfn = dev_tmp->devfn;
1743                 }
1744                 spin_lock_irqsave(&device_domain_lock, flags);
1745                 list_for_each_entry(info, &device_domain_list, global) {
1746                         if (info->bus == bus && info->devfn == devfn) {
1747                                 found = info->domain;
1748                                 break;
1749                         }
1750                 }
1751                 spin_unlock_irqrestore(&device_domain_lock, flags);
1752                 /* pcie-pci bridge already has a domain, use it */
1753                 if (found) {
1754                         domain = found;
1755                         goto found_domain;
1756                 }
1757         }
1758
1759         /* Allocate new domain for the device */
1760         drhd = dmar_find_matched_drhd_unit(pdev);
1761         if (!drhd) {
1762                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1763                         pci_name(pdev));
1764                 return NULL;
1765         }
1766         iommu = drhd->iommu;
1767
1768         domain = iommu_alloc_domain(iommu);
1769         if (!domain)
1770                 goto error;
1771
1772         if (domain_init(domain, gaw)) {
1773                 domain_exit(domain);
1774                 goto error;
1775         }
1776
1777         /* register pcie-to-pci device */
1778         if (dev_tmp) {
1779                 info = alloc_devinfo_mem();
1780                 if (!info) {
1781                         domain_exit(domain);
1782                         goto error;
1783                 }
1784                 info->bus = bus;
1785                 info->devfn = devfn;
1786                 info->dev = NULL;
1787                 info->domain = domain;
1788                 /* This domain is shared by devices under p2p bridge */
1789                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1790
1791                 /* pcie-to-pci bridge already has a domain, use it */
1792                 found = NULL;
1793                 spin_lock_irqsave(&device_domain_lock, flags);
1794                 list_for_each_entry(tmp, &device_domain_list, global) {
1795                         if (tmp->bus == bus && tmp->devfn == devfn) {
1796                                 found = tmp->domain;
1797                                 break;
1798                         }
1799                 }
1800                 if (found) {
1801                         free_devinfo_mem(info);
1802                         domain_exit(domain);
1803                         domain = found;
1804                 } else {
1805                         list_add(&info->link, &domain->devices);
1806                         list_add(&info->global, &device_domain_list);
1807                 }
1808                 spin_unlock_irqrestore(&device_domain_lock, flags);
1809         }
1810
1811 found_domain:
1812         info = alloc_devinfo_mem();
1813         if (!info)
1814                 goto error;
1815         info->bus = pdev->bus->number;
1816         info->devfn = pdev->devfn;
1817         info->dev = pdev;
1818         info->domain = domain;
1819         spin_lock_irqsave(&device_domain_lock, flags);
1820         /* somebody else may have beaten us to it */
1821         found = find_domain(pdev);
1822         if (found != NULL) {
1823                 spin_unlock_irqrestore(&device_domain_lock, flags);
1824                 if (found != domain) {
1825                         domain_exit(domain);
1826                         domain = found;
1827                 }
1828                 free_devinfo_mem(info);
1829                 return domain;
1830         }
1831         list_add(&info->link, &domain->devices);
1832         list_add(&info->global, &device_domain_list);
1833         pdev->dev.archdata.iommu = info;
1834         spin_unlock_irqrestore(&device_domain_lock, flags);
1835         return domain;
1836 error:
1837         /* recheck here; someone else may have set it meanwhile */
1838         return find_domain(pdev);
1839 }
1840
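/*
 * Set up a 1:1 (identity) mapping for [start, end) on pdev's domain:
 * reserve the IOVA range, clear any existing ptes, install the mapping
 * and finally program the context entry for the device.
 */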
1841 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1842                                       unsigned long long start,
1843                                       unsigned long long end)
1844 {
1845         struct dmar_domain *domain;
1846         unsigned long size;
1847         unsigned long long base;
1848         int ret;
1849
1850         printk(KERN_INFO
1851                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1852                 pci_name(pdev), start, end);
1853         /* page table init */
1854         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1855         if (!domain)
1856                 return -ENOMEM;
1857
1858         /* The address might not be aligned */
1859         base = start & PAGE_MASK;
1860         size = end - base;
1861         size = PAGE_ALIGN(size);
1862         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1863                         IOVA_PFN(base + size) - 1)) {
1864                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1865                 ret = -ENOMEM;
1866                 goto error;
1867         }
1868
1869         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1870                 size, base, pci_name(pdev));
1871         /*
1872          * RMRR range might have overlap with physical memory range,
1873          * clear it first
1874          */
1875         dma_pte_clear_range(domain, base, base + size);
1876
1877         ret = domain_page_mapping(domain, base, base, size,
1878                 DMA_PTE_READ|DMA_PTE_WRITE);
1879         if (ret)
1880                 goto error;
1881
1882         /* context entry init */
1883         ret = domain_context_mapping(domain, pdev);
1884         if (!ret)
1885                 return 0;
1886 error:
1887         domain_exit(domain);
1888         return ret;
1889
1890 }
1891
1892 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1893         struct pci_dev *pdev)
1894 {
1895         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1896                 return 0;
1897         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1898                 rmrr->end_address + 1);
1899 }
1900
1901 #ifdef CONFIG_DMAR_GFX_WA
1902 struct iommu_prepare_data {
1903         struct pci_dev *pdev;
1904         int ret;
1905 };
1906
1907 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1908                                          unsigned long end_pfn, void *datax)
1909 {
1910         struct iommu_prepare_data *data;
1911
1912         data = (struct iommu_prepare_data *)datax;
1913
1914         data->ret = iommu_prepare_identity_map(data->pdev,
1915                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1916         return data->ret;
1917
1918 }
1919
1920 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1921 {
1922         int nid;
1923         struct iommu_prepare_data data;
1924
1925         data.pdev = pdev;
1926         data.ret = 0;
1927
1928         for_each_online_node(nid) {
1929                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1930                 if (data.ret)
1931                         return data.ret;
1932         }
1933         return data.ret;
1934 }
1935
1936 static void __init iommu_prepare_gfx_mapping(void)
1937 {
1938         struct pci_dev *pdev = NULL;
1939         int ret;
1940
1941         for_each_pci_dev(pdev) {
1942                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1943                                 !IS_GFX_DEVICE(pdev))
1944                         continue;
1945                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1946                         pci_name(pdev));
1947                 ret = iommu_prepare_with_active_regions(pdev);
1948                 if (ret)
1949                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1950         }
1951 }
1952 #else /* !CONFIG_DMAR_GFX_WA */
1953 static inline void iommu_prepare_gfx_mapping(void)
1954 {
1955         return;
1956 }
1957 #endif
1958
1959 #ifdef CONFIG_DMAR_FLOPPY_WA
1960 static inline void iommu_prepare_isa(void)
1961 {
1962         struct pci_dev *pdev;
1963         int ret;
1964
1965         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1966         if (!pdev)
1967                 return;
1968
1969         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1970         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1971
1972         if (ret)
1973                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1974                         "floppy might not work\n");
1975
1976 }
1977 #else
1978 static inline void iommu_prepare_isa(void)
1979 {
1980         return;
1981 }
1982 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1983
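/*
 * One-time DMA-remapping initialization: count the DRHD units, allocate
 * the global iommu and deferred-flush arrays, set up per-IOMMU domain
 * ids and root entries, pick queued or register-based invalidation, map
 * the RMRR/gfx/ISA workaround regions, then enable fault reporting and
 * translation on every unit.
 */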
1984 static int __init init_dmars(void)
1985 {
1986         struct dmar_drhd_unit *drhd;
1987         struct dmar_rmrr_unit *rmrr;
1988         struct pci_dev *pdev;
1989         struct intel_iommu *iommu;
1990         int i, ret, unit = 0;
1991
1992         /*
1993          * for each drhd
1994          *    allocate root
1995          *    initialize and program root entry to not present
1996          * endfor
1997          */
1998         for_each_drhd_unit(drhd) {
1999                 g_num_of_iommus++;
2000                 /*
2001                  * No lock needed: this is only incremented in the
2002                  * single-threaded kernel __init code path; all other
2003                  * accesses are read-only.
2004                  */
2005         }
2006
2007         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
2008                         GFP_KERNEL);
2009         if (!g_iommus) {
2010                 printk(KERN_ERR "Allocating global iommu array failed\n");
2011                 ret = -ENOMEM;
2012                 goto error;
2013         }
2014
2015         deferred_flush = kzalloc(g_num_of_iommus *
2016                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2017         if (!deferred_flush) {
2018                 kfree(g_iommus);
2019                 ret = -ENOMEM;
2020                 goto error;
2021         }
2022
2023         for_each_drhd_unit(drhd) {
2024                 if (drhd->ignored)
2025                         continue;
2026
2027                 iommu = drhd->iommu;
2028                 g_iommus[iommu->seq_id] = iommu;
2029
2030                 ret = iommu_init_domains(iommu);
2031                 if (ret)
2032                         goto error;
2033
2034                 /*
2035                  * TBD:
2036                  * we could share the same root & context tables
2037                  * among all IOMMUs. Need to split it later.
2038                  */
2039                 ret = iommu_alloc_root_entry(iommu);
2040                 if (ret) {
2041                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2042                         goto error;
2043                 }
2044         }
2045
2046         for_each_drhd_unit(drhd) {
2047                 if (drhd->ignored)
2048                         continue;
2049
2050                 iommu = drhd->iommu;
2051                 if (dmar_enable_qi(iommu)) {
2052                         /*
2053                          * Queued Invalidate not enabled, use Register Based
2054                          * Invalidate
2055                          */
2056                         iommu->flush.flush_context = __iommu_flush_context;
2057                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2058                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2059                                "invalidation\n",
2060                                (unsigned long long)drhd->reg_base_addr);
2061                 } else {
2062                         iommu->flush.flush_context = qi_flush_context;
2063                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2064                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2065                                "invalidation\n",
2066                                (unsigned long long)drhd->reg_base_addr);
2067                 }
2068         }
2069
2070         /*
2071          * For each rmrr
2072          *   for each dev attached to rmrr
2073          *   do
2074          *     locate drhd for dev, alloc domain for dev
2075          *     allocate free domain
2076          *     allocate page table entries for rmrr
2077          *     if context not allocated for bus
2078          *           allocate and init context
2079          *           set present in root table for this bus
2080          *     init context with domain, translation etc
2081          *    endfor
2082          * endfor
2083          */
2084         for_each_rmrr_units(rmrr) {
2085                 for (i = 0; i < rmrr->devices_cnt; i++) {
2086                         pdev = rmrr->devices[i];
2087                         /* some BIOSes list non-existent devices in the DMAR table */
2088                         if (!pdev)
2089                                 continue;
2090                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2091                         if (ret)
2092                                 printk(KERN_ERR
2093                                  "IOMMU: mapping reserved region failed\n");
2094                 }
2095         }
2096
2097         iommu_prepare_gfx_mapping();
2098
2099         iommu_prepare_isa();
2100
2101         /*
2102          * for each drhd
2103          *   enable fault log
2104          *   global invalidate context cache
2105          *   global invalidate iotlb
2106          *   enable translation
2107          */
2108         for_each_drhd_unit(drhd) {
2109                 if (drhd->ignored)
2110                         continue;
2111                 iommu = drhd->iommu;
2112                 sprintf(iommu->name, "dmar%d", unit++);
2113
2114                 iommu_flush_write_buffer(iommu);
2115
2116                 ret = dmar_set_interrupt(iommu);
2117                 if (ret)
2118                         goto error;
2119
2120                 iommu_set_root_entry(iommu);
2121
2122                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2123                                            0);
2124                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2125                                          0);
2126                 iommu_disable_protect_mem_regions(iommu);
2127
2128                 ret = iommu_enable_translation(iommu);
2129                 if (ret)
2130                         goto error;
2131         }
2132
2133         return 0;
2134 error:
2135         for_each_drhd_unit(drhd) {
2136                 if (drhd->ignored)
2137                         continue;
2138                 iommu = drhd->iommu;
2139                 free_iommu(iommu);
2140         }
2141         kfree(g_iommus);
2142         return ret;
2143 }
2144
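/*
 * Return the mapping size rounded up to whole pages, taking the offset
 * of host_addr within its page into account.  For example, with 4KiB
 * pages an offset of 0xf00 and a size of 0x200 span two pages, so
 * 0x2000 is returned.
 */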
2145 static inline u64 aligned_size(u64 host_addr, size_t size)
2146 {
2147         u64 addr;
2148         addr = (host_addr & (~PAGE_MASK)) + size;
2149         return PAGE_ALIGN(addr);
2150 }
2151
2152 struct iova *
2153 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2154 {
2155         struct iova *piova;
2156
2157         /* Make sure it's in range */
2158         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2159         if (!size || (IOVA_START_ADDR + size > end))
2160                 return NULL;
2161
2162         piova = alloc_iova(&domain->iovad,
2163                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2164         return piova;
2165 }
2166
2167 static struct iova *
2168 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2169                    size_t size, u64 dma_mask)
2170 {
2171         struct pci_dev *pdev = to_pci_dev(dev);
2172         struct iova *iova = NULL;
2173
2174         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2175                 iova = iommu_alloc_iova(domain, size, dma_mask);
2176         else {
2177                 /*
2178                  * First try to allocate an io virtual address in
2179                  * DMA_32BIT_MASK and if that fails then try allocating
2180                  * from higher range
2181                  */
2182                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2183                 if (!iova)
2184                         iova = iommu_alloc_iova(domain, size, dma_mask);
2185         }
2186
2187         if (!iova) {
2188                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2189                 return NULL;
2190         }
2191
2192         return iova;
2193 }
2194
2195 static struct dmar_domain *
2196 get_valid_domain_for_dev(struct pci_dev *pdev)
2197 {
2198         struct dmar_domain *domain;
2199         int ret;
2200
2201         domain = get_domain_for_dev(pdev,
2202                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2203         if (!domain) {
2204                 printk(KERN_ERR
2205                         "Allocating domain for %s failed\n", pci_name(pdev));
2206                 return NULL;
2207         }
2208
2209         /* make sure context mapping is ok */
2210         if (unlikely(!domain_context_mapped(pdev))) {
2211                 ret = domain_context_mapping(domain, pdev);
2212                 if (ret) {
2213                         printk(KERN_ERR
2214                                 "Domain context map for %s failed\n",
2215                                 pci_name(pdev));
2216                         return NULL;
2217                 }
2218         }
2219
2220         return domain;
2221 }
2222
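/*
 * Core of the DMA-API map path: look up (or create) the device's
 * domain, allocate an IOVA that fits under dma_mask, install read/write
 * ptes for the whole covered page range and flush the IOTLB (or the
 * write buffer) for the non-present to present change.  Returns the bus
 * address to hand to the device, or 0 on failure.
 */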
2223 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2224                                      size_t size, int dir, u64 dma_mask)
2225 {
2226         struct pci_dev *pdev = to_pci_dev(hwdev);
2227         struct dmar_domain *domain;
2228         phys_addr_t start_paddr;
2229         struct iova *iova;
2230         int prot = 0;
2231         int ret;
2232         struct intel_iommu *iommu;
2233
2234         BUG_ON(dir == DMA_NONE);
2235         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2236                 return paddr;
2237
2238         domain = get_valid_domain_for_dev(pdev);
2239         if (!domain)
2240                 return 0;
2241
2242         iommu = domain_get_iommu(domain);
2243         size = aligned_size((u64)paddr, size);
2244
2245         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2246         if (!iova)
2247                 goto error;
2248
2249         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2250
2251         /*
2252          * Check if the DMAR supports zero-length reads on write-only
2253          * mappings.
2254          */
2255         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2256                         !cap_zlr(iommu->cap))
2257                 prot |= DMA_PTE_READ;
2258         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2259                 prot |= DMA_PTE_WRITE;
2260         /*
2261          * [paddr, paddr + size) might cover only part of a page; map the
2262          * whole page.  Note: if two parts of one page are mapped separately,
2263          * we might have two guest addresses mapping to the same host paddr,
2264          * but this is not a big problem.
2265          */
2266         ret = domain_page_mapping(domain, start_paddr,
2267                 ((u64)paddr) & PAGE_MASK, size, prot);
2268         if (ret)
2269                 goto error;
2270
2271         /* it's a non-present to present mapping */
2272         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2273                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2274         if (ret)
2275                 iommu_flush_write_buffer(iommu);
2276
2277         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2278
2279 error:
2280         if (iova)
2281                 __free_iova(&domain->iovad, iova);
2282         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2283                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2284         return 0;
2285 }
2286
2287 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2288                             size_t size, int dir)
2289 {
2290         return __intel_map_single(hwdev, paddr, size, dir,
2291                                   to_pci_dev(hwdev)->dma_mask);
2292 }
2293
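/*
 * Flush all deferred unmaps: for every IOMMU with queued entries do one
 * global IOTLB flush and then free the queued IOVAs.  Called with
 * async_umap_flush_lock held.
 */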
2294 static void flush_unmaps(void)
2295 {
2296         int i, j;
2297
2298         timer_on = 0;
2299
2300         /* just flush them all */
2301         for (i = 0; i < g_num_of_iommus; i++) {
2302                 struct intel_iommu *iommu = g_iommus[i];
2303                 if (!iommu)
2304                         continue;
2305
2306                 if (deferred_flush[i].next) {
2307                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2308                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2309                         for (j = 0; j < deferred_flush[i].next; j++) {
2310                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2311                                                 deferred_flush[i].iova[j]);
2312                         }
2313                         deferred_flush[i].next = 0;
2314                 }
2315         }
2316
2317         list_size = 0;
2318 }
2319
2320 static void flush_unmaps_timeout(unsigned long data)
2321 {
2322         unsigned long flags;
2323
2324         spin_lock_irqsave(&async_umap_flush_lock, flags);
2325         flush_unmaps();
2326         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2327 }
2328
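/*
 * Queue an IOVA for deferred freeing on its IOMMU's flush list.  When
 * the list hits HIGH_WATER_MARK everything is flushed immediately;
 * otherwise a 10ms timer is armed, if not already pending, to flush the
 * batch later.
 */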
2329 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2330 {
2331         unsigned long flags;
2332         int next, iommu_id;
2333         struct intel_iommu *iommu;
2334
2335         spin_lock_irqsave(&async_umap_flush_lock, flags);
2336         if (list_size == HIGH_WATER_MARK)
2337                 flush_unmaps();
2338
2339         iommu = domain_get_iommu(dom);
2340         iommu_id = iommu->seq_id;
2341
2342         next = deferred_flush[iommu_id].next;
2343         deferred_flush[iommu_id].domain[next] = dom;
2344         deferred_flush[iommu_id].iova[next] = iova;
2345         deferred_flush[iommu_id].next++;
2346
2347         if (!timer_on) {
2348                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2349                 timer_on = 1;
2350         }
2351         list_size++;
2352         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2353 }
2354
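/*
 * DMA-API unmap path: clear the ptes and page tables backing dev_addr,
 * then either flush the IOTLB and free the IOVA immediately
 * (intel_iommu_strict) or queue the IOVA on the deferred-flush list.
 */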
2355 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2356                         int dir)
2357 {
2358         struct pci_dev *pdev = to_pci_dev(dev);
2359         struct dmar_domain *domain;
2360         unsigned long start_addr;
2361         struct iova *iova;
2362         struct intel_iommu *iommu;
2363
2364         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2365                 return;
2366         domain = find_domain(pdev);
2367         BUG_ON(!domain);
2368
2369         iommu = domain_get_iommu(domain);
2370
2371         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2372         if (!iova)
2373                 return;
2374
2375         start_addr = iova->pfn_lo << PAGE_SHIFT;
2376         size = aligned_size((u64)dev_addr, size);
2377
2378         pr_debug("Device %s unmapping: %lx@%llx\n",
2379                 pci_name(pdev), size, (unsigned long long)start_addr);
2380
2381         /* clear the ptes covering the unmapped range */
2382         dma_pte_clear_range(domain, start_addr, start_addr + size);
2383         /* free page tables */
2384         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2385         if (intel_iommu_strict) {
2386                 if (iommu_flush_iotlb_psi(iommu,
2387                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2388                         iommu_flush_write_buffer(iommu);
2389                 /* free iova */
2390                 __free_iova(&domain->iovad, iova);
2391         } else {
2392                 add_unmap(domain, iova);
2393                 /*
2394                  * queue up the release of the unmap to save the roughly 1/6th
2395                  * of the cpu time otherwise spent on the iotlb flush operation...
2396                  */
2397         }
2398 }
2399
2400 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2401                            dma_addr_t *dma_handle, gfp_t flags)
2402 {
2403         void *vaddr;
2404         int order;
2405
2406         size = PAGE_ALIGN(size);
2407         order = get_order(size);
2408         flags &= ~(GFP_DMA | GFP_DMA32);
2409
2410         vaddr = (void *)__get_free_pages(flags, order);
2411         if (!vaddr)
2412                 return NULL;
2413         memset(vaddr, 0, size);
2414
2415         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2416                                          DMA_BIDIRECTIONAL,
2417                                          hwdev->coherent_dma_mask);
2418         if (*dma_handle)
2419                 return vaddr;
2420         free_pages((unsigned long)vaddr, order);
2421         return NULL;
2422 }
2423
2424 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2425                          dma_addr_t dma_handle)
2426 {
2427         int order;
2428
2429         size = PAGE_ALIGN(size);
2430         order = get_order(size);
2431
2432         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2433         free_pages((unsigned long)vaddr, order);
2434 }
2435
2436 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2437
2438 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2439                     int nelems, int dir)
2440 {
2441         int i;
2442         struct pci_dev *pdev = to_pci_dev(hwdev);
2443         struct dmar_domain *domain;
2444         unsigned long start_addr;
2445         struct iova *iova;
2446         size_t size = 0;
2447         void *addr;
2448         struct scatterlist *sg;
2449         struct intel_iommu *iommu;
2450
2451         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2452                 return;
2453
2454         domain = find_domain(pdev);
2455         BUG_ON(!domain);
2456
2457         iommu = domain_get_iommu(domain);
2458
2459         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2460         if (!iova)
2461                 return;
2462         for_each_sg(sglist, sg, nelems, i) {
2463                 addr = SG_ENT_VIRT_ADDRESS(sg);
2464                 size += aligned_size((u64)addr, sg->length);
2465         }
2466
2467         start_addr = iova->pfn_lo << PAGE_SHIFT;
2468
2469         /* clear the ptes covering the unmapped range */
2470         dma_pte_clear_range(domain, start_addr, start_addr + size);
2471         /* free page tables */
2472         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2473
2474         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2475                         size >> VTD_PAGE_SHIFT, 0))
2476                 iommu_flush_write_buffer(iommu);
2477
2478         /* free iova */
2479         __free_iova(&domain->iovad, iova);
2480 }
2481
2482 static int intel_nontranslate_map_sg(struct device *hddev,
2483         struct scatterlist *sglist, int nelems, int dir)
2484 {
2485         int i;
2486         struct scatterlist *sg;
2487
2488         for_each_sg(sglist, sg, nelems, i) {
2489                 BUG_ON(!sg_page(sg));
2490                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2491                 sg->dma_length = sg->length;
2492         }
2493         return nelems;
2494 }
2495
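/*
 * DMA-API scatterlist map path: allocate one contiguous IOVA range big
 * enough for every segment, map each segment into it and fill in
 * sg->dma_address/dma_length.  On a mapping failure the partial range
 * is torn down and 0 is returned.
 */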
2496 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2497                  int dir)
2498 {
2499         void *addr;
2500         int i;
2501         struct pci_dev *pdev = to_pci_dev(hwdev);
2502         struct dmar_domain *domain;
2503         size_t size = 0;
2504         int prot = 0;
2505         size_t offset = 0;
2506         struct iova *iova = NULL;
2507         int ret;
2508         struct scatterlist *sg;
2509         unsigned long start_addr;
2510         struct intel_iommu *iommu;
2511
2512         BUG_ON(dir == DMA_NONE);
2513         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2514                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2515
2516         domain = get_valid_domain_for_dev(pdev);
2517         if (!domain)
2518                 return 0;
2519
2520         iommu = domain_get_iommu(domain);
2521
2522         for_each_sg(sglist, sg, nelems, i) {
2523                 addr = SG_ENT_VIRT_ADDRESS(sg);
2524                 addr = (void *)virt_to_phys(addr);
2525                 size += aligned_size((u64)addr, sg->length);
2526         }
2527
2528         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2529         if (!iova) {
2530                 sglist->dma_length = 0;
2531                 return 0;
2532         }
2533
2534         /*
2535          * Check if the DMAR supports zero-length reads on write-only
2536          * mappings.
2537          */
2538         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2539                         !cap_zlr(iommu->cap))
2540                 prot |= DMA_PTE_READ;
2541         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2542                 prot |= DMA_PTE_WRITE;
2543
2544         start_addr = iova->pfn_lo << PAGE_SHIFT;
2545         offset = 0;
2546         for_each_sg(sglist, sg, nelems, i) {
2547                 addr = SG_ENT_VIRT_ADDRESS(sg);
2548                 addr = (void *)virt_to_phys(addr);
2549                 size = aligned_size((u64)addr, sg->length);
2550                 ret = domain_page_mapping(domain, start_addr + offset,
2551                         ((u64)addr) & PAGE_MASK,
2552                         size, prot);
2553                 if (ret) {
2554                         /* clear the ptes mapped so far */
2555                         dma_pte_clear_range(domain, start_addr,
2556                                   start_addr + offset);
2557                         /* free page tables */
2558                         dma_pte_free_pagetable(domain, start_addr,
2559                                   start_addr + offset);
2560                         /* free iova */
2561                         __free_iova(&domain->iovad, iova);
2562                         return 0;
2563                 }
2564                 sg->dma_address = start_addr + offset +
2565                                 ((u64)addr & (~PAGE_MASK));
2566                 sg->dma_length = sg->length;
2567                 offset += size;
2568         }
2569
2570         /* it's a non-present to present mapping */
2571         if (iommu_flush_iotlb_psi(iommu, domain->id,
2572                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2573                 iommu_flush_write_buffer(iommu);
2574         return nelems;
2575 }
2576
2577 static struct dma_mapping_ops intel_dma_ops = {
2578         .alloc_coherent = intel_alloc_coherent,
2579         .free_coherent = intel_free_coherent,
2580         .map_single = intel_map_single,
2581         .unmap_single = intel_unmap_single,
2582         .map_sg = intel_map_sg,
2583         .unmap_sg = intel_unmap_sg,
2584 };
2585
2586 static inline int iommu_domain_cache_init(void)
2587 {
2588         int ret = 0;
2589
2590         iommu_domain_cache = kmem_cache_create("iommu_domain",
2591                                          sizeof(struct dmar_domain),
2592                                          0,
2593                                          SLAB_HWCACHE_ALIGN,
2595                                          NULL);
2596         if (!iommu_domain_cache) {
2597                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2598                 ret = -ENOMEM;
2599         }
2600
2601         return ret;
2602 }
2603
2604 static inline int iommu_devinfo_cache_init(void)
2605 {
2606         int ret = 0;
2607
2608         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2609                                          sizeof(struct device_domain_info),
2610                                          0,
2611                                          SLAB_HWCACHE_ALIGN,
2612                                          NULL);
2613         if (!iommu_devinfo_cache) {
2614                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2615                 ret = -ENOMEM;
2616         }
2617
2618         return ret;
2619 }
2620
2621 static inline int iommu_iova_cache_init(void)
2622 {
2623         int ret = 0;
2624
2625         iommu_iova_cache = kmem_cache_create("iommu_iova",
2626                                          sizeof(struct iova),
2627                                          0,
2628                                          SLAB_HWCACHE_ALIGN,
2629                                          NULL);
2630         if (!iommu_iova_cache) {
2631                 printk(KERN_ERR "Couldn't create iova cache\n");
2632                 ret = -ENOMEM;
2633         }
2634
2635         return ret;
2636 }
2637
2638 static int __init iommu_init_mempool(void)
2639 {
2640         int ret;
2641         ret = iommu_iova_cache_init();
2642         if (ret)
2643                 return ret;
2644
2645         ret = iommu_domain_cache_init();
2646         if (ret)
2647                 goto domain_error;
2648
2649         ret = iommu_devinfo_cache_init();
2650         if (!ret)
2651                 return ret;
2652
2653         kmem_cache_destroy(iommu_domain_cache);
2654 domain_error:
2655         kmem_cache_destroy(iommu_iova_cache);
2656
2657         return -ENOMEM;
2658 }
2659
2660 static void __init iommu_exit_mempool(void)
2661 {
2662         kmem_cache_destroy(iommu_devinfo_cache);
2663         kmem_cache_destroy(iommu_domain_cache);
2664         kmem_cache_destroy(iommu_iova_cache);
2665
2666 }
2667
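/*
 * Mark DRHD units that cover no PCI devices as ignored and, unless
 * dmar_map_gfx is set, bypass units that cover only graphics devices by
 * tagging their devices with DUMMY_DEVICE_DOMAIN_INFO.
 */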
2668 static void __init init_no_remapping_devices(void)
2669 {
2670         struct dmar_drhd_unit *drhd;
2671
2672         for_each_drhd_unit(drhd) {
2673                 if (!drhd->include_all) {
2674                         int i;
2675                         for (i = 0; i < drhd->devices_cnt; i++)
2676                                 if (drhd->devices[i] != NULL)
2677                                         break;
2678                         /* ignore DMAR unit if no pci devices exist */
2679                         if (i == drhd->devices_cnt)
2680                                 drhd->ignored = 1;
2681                 }
2682         }
2683
2684         if (dmar_map_gfx)
2685                 return;
2686
2687         for_each_drhd_unit(drhd) {
2688                 int i;
2689                 if (drhd->ignored || drhd->include_all)
2690                         continue;
2691
2692                 for (i = 0; i < drhd->devices_cnt; i++)
2693                         if (drhd->devices[i] &&
2694                                 !IS_GFX_DEVICE(drhd->devices[i]))
2695                                 break;
2696
2697                 if (i < drhd->devices_cnt)
2698                         continue;
2699
2700                 /* bypass IOMMU if it is just for gfx devices */
2701                 drhd->ignored = 1;
2702                 for (i = 0; i < drhd->devices_cnt; i++) {
2703                         if (!drhd->devices[i])
2704                                 continue;
2705                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2706                 }
2707         }
2708 }
2709
2710 int __init intel_iommu_init(void)
2711 {
2712         int ret = 0;
2713
2714         if (dmar_table_init())
2715                 return  -ENODEV;
2716
2717         if (dmar_dev_scope_init())
2718                 return  -ENODEV;
2719
2720         /*
2721          * Check the need for DMA-remapping initialization now.
2722          * Above initialization will also be used by Interrupt-remapping.
2723          */
2724         if (no_iommu || swiotlb || dmar_disabled)
2725                 return -ENODEV;
2726
2727         iommu_init_mempool();
2728         dmar_init_reserved_ranges();
2729
2730         init_no_remapping_devices();
2731
2732         ret = init_dmars();
2733         if (ret) {
2734                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2735                 put_iova_domain(&reserved_iova_list);
2736                 iommu_exit_mempool();
2737                 return ret;
2738         }
2739         printk(KERN_INFO
2740         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2741
2742         init_timer(&unmap_timer);
2743         force_iommu = 1;
2744         dma_ops = &intel_dma_ops;
2745
2746         register_iommu(&intel_iommu_ops);
2747
2748         return 0;
2749 }
2750
2751 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2752                                   struct pci_dev *pdev)
2753 {
2754         struct device_domain_info *info;
2755         unsigned long flags;
2756
2757         info = alloc_devinfo_mem();
2758         if (!info)
2759                 return -ENOMEM;
2760
2761         info->bus = pdev->bus->number;
2762         info->devfn = pdev->devfn;
2763         info->dev = pdev;
2764         info->domain = domain;
2765
2766         spin_lock_irqsave(&device_domain_lock, flags);
2767         list_add(&info->link, &domain->devices);
2768         list_add(&info->global, &device_domain_list);
2769         pdev->dev.archdata.iommu = info;
2770         spin_unlock_irqrestore(&device_domain_lock, flags);
2771
2772         return 0;
2773 }
2774
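/*
 * Remove pdev from a virtual-machine domain.  If no other device in the
 * domain is behind the same IOMMU, also clear that IOMMU from iommu_bmp
 * and update the domain's iommu count and coherency.
 */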
2775 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2776                                           struct pci_dev *pdev)
2777 {
2778         struct device_domain_info *info;
2779         struct intel_iommu *iommu;
2780         unsigned long flags;
2781         int found = 0;
2782         struct list_head *entry, *tmp;
2783
2784         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2785         if (!iommu)
2786                 return;
2787
2788         spin_lock_irqsave(&device_domain_lock, flags);
2789         list_for_each_safe(entry, tmp, &domain->devices) {
2790                 info = list_entry(entry, struct device_domain_info, link);
2791                 if (info->bus == pdev->bus->number &&
2792                     info->devfn == pdev->devfn) {
2793                         list_del(&info->link);
2794                         list_del(&info->global);
2795                         if (info->dev)
2796                                 info->dev->dev.archdata.iommu = NULL;
2797                         spin_unlock_irqrestore(&device_domain_lock, flags);
2798
2799                         iommu_detach_dev(iommu, info->bus, info->devfn);
2800                         free_devinfo_mem(info);
2801
2802                         spin_lock_irqsave(&device_domain_lock, flags);
2803
2804                         if (found)
2805                                 break;
2806                         else
2807                                 continue;
2808                 }
2809
2810                 /* if there are no other devices under the same iommu
2811                  * owned by this domain, clear this iommu in iommu_bmp
2812                  * and update the iommu count and coherency
2813                  */
2814                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2815                         found = 1;
2816         }
2817
2818         if (found == 0) {
2819                 unsigned long tmp_flags;
2820                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2821                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2822                 domain->iommu_count--;
2823                 domain_update_iommu_coherency(domain);
2824                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2825         }
2826
2827         spin_unlock_irqrestore(&device_domain_lock, flags);
2828 }
2829
2830 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2831 {
2832         struct device_domain_info *info;
2833         struct intel_iommu *iommu;
2834         unsigned long flags1, flags2;
2835
2836         spin_lock_irqsave(&device_domain_lock, flags1);
2837         while (!list_empty(&domain->devices)) {
2838                 info = list_entry(domain->devices.next,
2839                         struct device_domain_info, link);
2840                 list_del(&info->link);
2841                 list_del(&info->global);
2842                 if (info->dev)
2843                         info->dev->dev.archdata.iommu = NULL;
2844
2845                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2846
2847                 iommu = device_to_iommu(info->bus, info->devfn);
2848                 iommu_detach_dev(iommu, info->bus, info->devfn);
2849
2850                 /* clear this iommu in iommu_bmp, update iommu count
2851                  * and coherency
2852                  */
2853                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2854                 if (test_and_clear_bit(iommu->seq_id,
2855                                        &domain->iommu_bmp)) {
2856                         domain->iommu_count--;
2857                         domain_update_iommu_coherency(domain);
2858                 }
2859                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2860
2861                 free_devinfo_mem(info);
2862                 spin_lock_irqsave(&device_domain_lock, flags1);
2863         }
2864         spin_unlock_irqrestore(&device_domain_lock, flags1);
2865 }
2866
2867 /* domain id for a virtual machine; it is never written into a context entry */
2868 static unsigned long vm_domid;
2869
2870 static int vm_domain_min_agaw(struct dmar_domain *domain)
2871 {
2872         int i;
2873         int min_agaw = domain->agaw;
2874
2875         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2876         for (; i < g_num_of_iommus; ) {
2877                 if (min_agaw > g_iommus[i]->agaw)
2878                         min_agaw = g_iommus[i]->agaw;
2879
2880                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2881         }
2882
2883         return min_agaw;
2884 }
2885
2886 static struct dmar_domain *iommu_alloc_vm_domain(void)
2887 {
2888         struct dmar_domain *domain;
2889
2890         domain = alloc_domain_mem();
2891         if (!domain)
2892                 return NULL;
2893
2894         domain->id = vm_domid++;
2895         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2896         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2897
2898         return domain;
2899 }
2900
2901 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2902 {
2903         int adjust_width;
2904
2905         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2906         spin_lock_init(&domain->mapping_lock);
2907         spin_lock_init(&domain->iommu_lock);
2908
2909         domain_reserve_special_ranges(domain);
2910
2911         /* calculate AGAW */
2912         domain->gaw = guest_width;
2913         adjust_width = guestwidth_to_adjustwidth(guest_width);
2914         domain->agaw = width_to_agaw(adjust_width);
2915
2916         INIT_LIST_HEAD(&domain->devices);
2917
2918         domain->iommu_count = 0;
2919         domain->iommu_coherency = 0;
2920         domain->max_addr = 0;
2921
2922         /* always allocate the top pgd */
2923         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2924         if (!domain->pgd)
2925                 return -ENOMEM;
2926         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2927         return 0;
2928 }
2929
2930 static void iommu_free_vm_domain(struct dmar_domain *domain)
2931 {
2932         unsigned long flags;
2933         struct dmar_drhd_unit *drhd;
2934         struct intel_iommu *iommu;
2935         unsigned long i;
2936         unsigned long ndomains;
2937
2938         for_each_drhd_unit(drhd) {
2939                 if (drhd->ignored)
2940                         continue;
2941                 iommu = drhd->iommu;
2942
2943                 ndomains = cap_ndoms(iommu->cap);
2944                 i = find_first_bit(iommu->domain_ids, ndomains);
2945                 for (; i < ndomains; ) {
2946                         if (iommu->domains[i] == domain) {
2947                                 spin_lock_irqsave(&iommu->lock, flags);
2948                                 clear_bit(i, iommu->domain_ids);
2949                                 iommu->domains[i] = NULL;
2950                                 spin_unlock_irqrestore(&iommu->lock, flags);
2951                                 break;
2952                         }
2953                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2954                 }
2955         }
2956 }
2957
2958 static void vm_domain_exit(struct dmar_domain *domain)
2959 {
2960         u64 end;
2961
2962         /* Domain 0 is reserved, so don't process it */
2963         if (!domain)
2964                 return;
2965
2966         vm_domain_remove_all_dev_info(domain);
2967         /* destroy iovas */
2968         put_iova_domain(&domain->iovad);
2969         end = DOMAIN_MAX_ADDR(domain->gaw);
2970         end = end & (~VTD_PAGE_MASK);
2971
2972         /* clear ptes */
2973         dma_pte_clear_range(domain, 0, end);
2974
2975         /* free page tables */
2976         dma_pte_free_pagetable(domain, 0, end);
2977
2978         iommu_free_vm_domain(domain);
2979         free_domain_mem(domain);
2980 }
2981
2982 static int intel_iommu_domain_init(struct iommu_domain *domain)
2983 {
2984         struct dmar_domain *dmar_domain;
2985
2986         dmar_domain = iommu_alloc_vm_domain();
2987         if (!dmar_domain) {
2988                 printk(KERN_ERR
2989                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2990                 return -ENOMEM;
2991         }
2992         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2993                 printk(KERN_ERR
2994                         "intel_iommu_domain_init() failed\n");
2995                 vm_domain_exit(dmar_domain);
2996                 return -ENOMEM;
2997         }
2998         domain->priv = dmar_domain;
2999
3000         return 0;
3001 }
3002
3003 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
3004 {
3005         struct dmar_domain *dmar_domain = domain->priv;
3006
3007         domain->priv = NULL;
3008         vm_domain_exit(dmar_domain);
3009 }
3010
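/*
 * iommu_ops attach hook: detach the device from any domain it is
 * already mapped into, verify that this IOMMU's agaw can address
 * everything mapped in the domain so far, then program the context
 * entry and record the device in the domain.
 */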
3011 static int intel_iommu_attach_device(struct iommu_domain *domain,
3012                                      struct device *dev)
3013 {
3014         struct dmar_domain *dmar_domain = domain->priv;
3015         struct pci_dev *pdev = to_pci_dev(dev);
3016         struct intel_iommu *iommu;
3017         int addr_width;
3018         u64 end;
3019         int ret;
3020
3021         /* normally pdev is not mapped */
3022         if (unlikely(domain_context_mapped(pdev))) {
3023                 struct dmar_domain *old_domain;
3024
3025                 old_domain = find_domain(pdev);
3026                 if (old_domain) {
3027                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3028                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3029                         else
3030                                 domain_remove_dev_info(old_domain);
3031                 }
3032         }
3033
3034         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3035         if (!iommu)
3036                 return -ENODEV;
3037
3038         /* check if this iommu agaw is sufficient for max mapped address */
3039         addr_width = agaw_to_width(iommu->agaw);
3040         end = DOMAIN_MAX_ADDR(addr_width);
3041         end = end & VTD_PAGE_MASK;
3042         if (end < dmar_domain->max_addr) {
3043                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3044                        "sufficient for the mapped address (%llx)\n",
3045                        __func__, iommu->agaw, dmar_domain->max_addr);
3046                 return -EFAULT;
3047         }
3048
3049         ret = domain_context_mapping(dmar_domain, pdev);
3050         if (ret)
3051                 return ret;
3052
3053         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3054         return ret;
3055 }
3056
3057 static void intel_iommu_detach_device(struct iommu_domain *domain,
3058                                       struct device *dev)
3059 {
3060         struct dmar_domain *dmar_domain = domain->priv;
3061         struct pci_dev *pdev = to_pci_dev(dev);
3062
3063         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3064 }
3065
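/*
 * iommu_ops map hook: translate IOMMU_READ/IOMMU_WRITE into DMA pte
 * protection bits, grow the domain's max_addr after checking that the
 * smallest agaw among the attached IOMMUs can still cover it, then
 * install the mapping with domain_page_mapping().
 */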
3066 static int intel_iommu_map_range(struct iommu_domain *domain,
3067                                  unsigned long iova, phys_addr_t hpa,
3068                                  size_t size, int iommu_prot)
3069 {
3070         struct dmar_domain *dmar_domain = domain->priv;
3071         u64 max_addr;
3072         int addr_width;
3073         int prot = 0;
3074         int ret;
3075
3076         if (iommu_prot & IOMMU_READ)
3077                 prot |= DMA_PTE_READ;
3078         if (iommu_prot & IOMMU_WRITE)
3079                 prot |= DMA_PTE_WRITE;
3080
3081         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3082         if (dmar_domain->max_addr < max_addr) {
3083                 int min_agaw;
3084                 u64 end;
3085
3086                 /* check if minimum agaw is sufficient for mapped address */
3087                 min_agaw = vm_domain_min_agaw(dmar_domain);
3088                 addr_width = agaw_to_width(min_agaw);
3089                 end = DOMAIN_MAX_ADDR(addr_width);
3090                 end = end & VTD_PAGE_MASK;
3091                 if (end < max_addr) {
3092                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3093                                "sufficient for the mapped address (%llx)\n",
3094                                __func__, min_agaw, max_addr);
3095                         return -EFAULT;
3096                 }
3097                 dmar_domain->max_addr = max_addr;
3098         }
3099
3100         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3101         return ret;
3102 }
3103
3104 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3105                                     unsigned long iova, size_t size)
3106 {
3107         struct dmar_domain *dmar_domain = domain->priv;
3108         dma_addr_t base;
3109
3110         /* The address might not be aligned */
3111         base = iova & VTD_PAGE_MASK;
3112         size = VTD_PAGE_ALIGN(size);
3113         dma_pte_clear_range(dmar_domain, base, base + size);
3114
3115         if (dmar_domain->max_addr == base + size)
3116                 dmar_domain->max_addr = base;
3117 }
3118
3119 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3120                                             unsigned long iova)
3121 {
3122         struct dmar_domain *dmar_domain = domain->priv;
3123         struct dma_pte *pte;
3124         u64 phys = 0;
3125
3126         pte = addr_to_dma_pte(dmar_domain, iova);
3127         if (pte)
3128                 phys = dma_pte_addr(pte);
3129
3130         return phys;
3131 }
3132
3133 static struct iommu_ops intel_iommu_ops = {
3134         .domain_init    = intel_iommu_domain_init,
3135         .domain_destroy = intel_iommu_domain_destroy,
3136         .attach_dev     = intel_iommu_attach_device,
3137         .detach_dev     = intel_iommu_detach_device,
3138         .map            = intel_iommu_map_range,
3139         .unmap          = intel_iommu_unmap_range,
3140         .iova_to_phys   = intel_iommu_iova_to_phys,
3141 };
3142
3143 static void __devinit quirk_iommu_rwbf(struct pci_dev *dev)
3144 {
3145         /*
3146          * Mobile 4 Series Chipset neglects to set RWBF capability,
3147          * but needs it:
3148          */
3149         printk(KERN_INFO "DMAR: Forcing write-buffer flush capability\n");
3150         rwbf_quirk = 1;
3151 }
3152
3153 DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x2a40, quirk_iommu_rwbf);