Check agaw is sufficient for mapped memory
[linux-2.6] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/intel-iommu.h>
38 #include <asm/cacheflush.h>
39 #include <asm/iommu.h>
40 #include "pci.h"
41
42 #define ROOT_SIZE               VTD_PAGE_SIZE
43 #define CONTEXT_SIZE            VTD_PAGE_SIZE
44
45 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
46 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
47
48 #define IOAPIC_RANGE_START      (0xfee00000)
49 #define IOAPIC_RANGE_END        (0xfeefffff)
50 #define IOVA_START_ADDR         (0x1000)
51
52 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
53
54 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
55
56 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
57 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
58 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
59
60 /* global iommu list, set NULL for ignored DMAR units */
61 static struct intel_iommu **g_iommus;
62
63 /*
64  * 0: Present
65  * 1-11: Reserved
66  * 12-63: Context Ptr (12 - (haw-1))
67  * 64-127: Reserved
68  */
69 struct root_entry {
70         u64     val;
71         u64     rsvd1;
72 };
73 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
74 static inline bool root_present(struct root_entry *root)
75 {
76         return (root->val & 1);
77 }
78 static inline void set_root_present(struct root_entry *root)
79 {
80         root->val |= 1;
81 }
82 static inline void set_root_value(struct root_entry *root, unsigned long value)
83 {
84         root->val |= value & VTD_PAGE_MASK;
85 }
86
87 static inline struct context_entry *
88 get_context_addr_from_root(struct root_entry *root)
89 {
90         return (struct context_entry *)
91                 (root_present(root)?phys_to_virt(
92                 root->val & VTD_PAGE_MASK) :
93                 NULL);
94 }
95
96 /*
97  * low 64 bits:
98  * 0: present
99  * 1: fault processing disable
100  * 2-3: translation type
101  * 12-63: address space root
102  * high 64 bits:
103  * 0-2: address width
104  * 3-6: avail
105  * 8-23: domain id
106  */
107 struct context_entry {
108         u64 lo;
109         u64 hi;
110 };
111
112 static inline bool context_present(struct context_entry *context)
113 {
114         return (context->lo & 1);
115 }
116 static inline void context_set_present(struct context_entry *context)
117 {
118         context->lo |= 1;
119 }
120
121 static inline void context_set_fault_enable(struct context_entry *context)
122 {
123         context->lo &= (((u64)-1) << 2) | 1;
124 }
125
126 #define CONTEXT_TT_MULTI_LEVEL 0
127
128 static inline void context_set_translation_type(struct context_entry *context,
129                                                 unsigned long value)
130 {
131         context->lo &= (((u64)-1) << 4) | 3;
132         context->lo |= (value & 3) << 2;
133 }
134
135 static inline void context_set_address_root(struct context_entry *context,
136                                             unsigned long value)
137 {
138         context->lo |= value & VTD_PAGE_MASK;
139 }
140
141 static inline void context_set_address_width(struct context_entry *context,
142                                              unsigned long value)
143 {
144         context->hi |= value & 7;
145 }
146
147 static inline void context_set_domain_id(struct context_entry *context,
148                                          unsigned long value)
149 {
150         context->hi |= (value & ((1 << 16) - 1)) << 8;
151 }
152
153 static inline void context_clear_entry(struct context_entry *context)
154 {
155         context->lo = 0;
156         context->hi = 0;
157 }
158
159 /*
160  * 0: readable
161  * 1: writable
162  * 2-6: reserved
163  * 7: super page
164  * 8-11: available
165  * 12-63: Host physcial address
166  */
167 struct dma_pte {
168         u64 val;
169 };
170
171 static inline void dma_clear_pte(struct dma_pte *pte)
172 {
173         pte->val = 0;
174 }
175
176 static inline void dma_set_pte_readable(struct dma_pte *pte)
177 {
178         pte->val |= DMA_PTE_READ;
179 }
180
181 static inline void dma_set_pte_writable(struct dma_pte *pte)
182 {
183         pte->val |= DMA_PTE_WRITE;
184 }
185
186 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
187 {
188         pte->val = (pte->val & ~3) | (prot & 3);
189 }
190
191 static inline u64 dma_pte_addr(struct dma_pte *pte)
192 {
193         return (pte->val & VTD_PAGE_MASK);
194 }
195
196 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
197 {
198         pte->val |= (addr & VTD_PAGE_MASK);
199 }
200
201 static inline bool dma_pte_present(struct dma_pte *pte)
202 {
203         return (pte->val & 3) != 0;
204 }
205
207 /* devices under the same p2p bridge are owned by one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
208
209 /* domain represents a virtual machine; more than one device
210  * across iommus may be owned by one domain, e.g. a kvm guest.
211  */
212 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
213
214 struct dmar_domain {
215         int     id;                     /* domain id */
216         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
217
218         struct list_head devices;       /* all devices' list */
219         struct iova_domain iovad;       /* iova's that belong to this domain */
220
221         struct dma_pte  *pgd;           /* virtual address */
222         spinlock_t      mapping_lock;   /* page table lock */
223         int             gaw;            /* max guest address width */
224
225         /* adjusted guest address width, 0 is level 2 30-bit */
226         int             agaw;
227
228         int             flags;          /* flags to find out type of domain */
229
230         int             iommu_coherency;/* indicate coherency of iommu access */
231         int             iommu_count;    /* reference count of iommu */
232         spinlock_t      iommu_lock;     /* protect iommu set in domain */
233         u64             max_addr;       /* maximum mapped address */
234 };
235
236 /* PCI domain-device relationship */
237 struct device_domain_info {
238         struct list_head link;  /* link to domain siblings */
239         struct list_head global; /* link to global list */
240         u8 bus;                 /* PCI bus number */
241         u8 devfn;               /* PCI devfn number */
242         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
243         struct dmar_domain *domain; /* pointer to domain */
244 };
245
246 static void flush_unmaps_timeout(unsigned long data);
247
248 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
249
250 #define HIGH_WATER_MARK 250
251 struct deferred_flush_tables {
252         int next;
253         struct iova *iova[HIGH_WATER_MARK];
254         struct dmar_domain *domain[HIGH_WATER_MARK];
255 };
256
257 static struct deferred_flush_tables *deferred_flush;
258
260 /* number of iommus, used to size g_iommus and the per-domain iommu bitmaps */
260 static int g_num_of_iommus;
261
262 static DEFINE_SPINLOCK(async_umap_flush_lock);
263 static LIST_HEAD(unmaps_to_do);
264
265 static int timer_on;
266 static long list_size;
267
268 static void domain_remove_dev_info(struct dmar_domain *domain);
269
270 int dmar_disabled;
271 static int __initdata dmar_map_gfx = 1;
272 static int dmar_forcedac;
273 static int intel_iommu_strict;
274
275 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
276 static DEFINE_SPINLOCK(device_domain_lock);
277 static LIST_HEAD(device_domain_list);
278
279 static int __init intel_iommu_setup(char *str)
280 {
281         if (!str)
282                 return -EINVAL;
283         while (*str) {
284                 if (!strncmp(str, "off", 3)) {
285                         dmar_disabled = 1;
286                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
287                 } else if (!strncmp(str, "igfx_off", 8)) {
288                         dmar_map_gfx = 0;
289                         printk(KERN_INFO
290                                 "Intel-IOMMU: disable GFX device mapping\n");
291                 } else if (!strncmp(str, "forcedac", 8)) {
292                         printk(KERN_INFO
293                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
294                         dmar_forcedac = 1;
295                 } else if (!strncmp(str, "strict", 6)) {
296                         printk(KERN_INFO
297                                 "Intel-IOMMU: disable batched IOTLB flush\n");
298                         intel_iommu_strict = 1;
299                 }
300
301                 str += strcspn(str, ",");
302                 while (*str == ',')
303                         str++;
304         }
305         return 0;
306 }
307 __setup("intel_iommu=", intel_iommu_setup);
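
/*
 * Illustrative usage (editorial note, not part of the original file):
 * the parser above takes a comma-separated list on the kernel command
 * line, so booting with
 *
 *     intel_iommu=igfx_off,strict
 *
 * keeps the IOMMU enabled but disables GFX device mapping and batched
 * IOTLB flushing, while "intel_iommu=off" disables the Intel IOMMU
 * entirely.
 */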
308
309 static struct kmem_cache *iommu_domain_cache;
310 static struct kmem_cache *iommu_devinfo_cache;
311 static struct kmem_cache *iommu_iova_cache;
312
313 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
314 {
315         unsigned int flags;
316         void *vaddr;
317
318         /* trying to avoid low memory issues */
319         flags = current->flags & PF_MEMALLOC;
320         current->flags |= PF_MEMALLOC;
321         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
322         current->flags &= (~PF_MEMALLOC | flags);
323         return vaddr;
324 }
325
326
327 static inline void *alloc_pgtable_page(void)
328 {
329         unsigned int flags;
330         void *vaddr;
331
332         /* trying to avoid low memory issues */
333         flags = current->flags & PF_MEMALLOC;
334         current->flags |= PF_MEMALLOC;
335         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
336         current->flags &= (~PF_MEMALLOC | flags);
337         return vaddr;
338 }
339
340 static inline void free_pgtable_page(void *vaddr)
341 {
342         free_page((unsigned long)vaddr);
343 }
344
345 static inline void *alloc_domain_mem(void)
346 {
347         return iommu_kmem_cache_alloc(iommu_domain_cache);
348 }
349
350 static void free_domain_mem(void *vaddr)
351 {
352         kmem_cache_free(iommu_domain_cache, vaddr);
353 }
354
356 static inline void *alloc_devinfo_mem(void)
356 {
357         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
358 }
359
360 static inline void free_devinfo_mem(void *vaddr)
361 {
362         kmem_cache_free(iommu_devinfo_cache, vaddr);
363 }
364
365 struct iova *alloc_iova_mem(void)
366 {
367         return iommu_kmem_cache_alloc(iommu_iova_cache);
368 }
369
370 void free_iova_mem(struct iova *iova)
371 {
372         kmem_cache_free(iommu_iova_cache, iova);
373 }
374
375
376 static inline int width_to_agaw(int width);
377
378 /* calculate agaw for each iommu.
379  * "SAGAW" may be different across iommus; use the default agaw and
380  * fall back to a smaller supported agaw for iommus that don't support it.
381  */
382 int iommu_calculate_agaw(struct intel_iommu *iommu)
383 {
384         unsigned long sagaw;
385         int agaw = -1;
386
387         sagaw = cap_sagaw(iommu->cap);
388         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
389              agaw >= 0; agaw--) {
390                 if (test_bit(agaw, &sagaw))
391                         break;
392         }
393
394         return agaw;
395 }
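
/*
 * Worked example (editorial note, not in the original source): with
 * DEFAULT_DOMAIN_ADDRESS_WIDTH = 48, the loop above starts at
 * width_to_agaw(48) = (48 - 30) / 9 = 2 and walks downwards.  An iommu
 * whose SAGAW field has bit 2 set therefore gets agaw 2 (48-bit,
 * 4-level page table); if only bit 1 is set it falls back to agaw 1
 * (39-bit, 3-level), and so on down to agaw 0 (30-bit, 2-level).
 */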
396
397 /* in the native (non-VM) case, each domain is related to only one iommu */
398 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
399 {
400         int iommu_id;
401
402         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
403
404         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
405         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
406                 return NULL;
407
408         return g_iommus[iommu_id];
409 }
410
411 /* "Coherency" capability may be different across iommus */
412 static void domain_update_iommu_coherency(struct dmar_domain *domain)
413 {
414         int i;
415
416         domain->iommu_coherency = 1;
417
418         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
419         for (; i < g_num_of_iommus; ) {
420                 if (!ecap_coherent(g_iommus[i]->ecap)) {
421                         domain->iommu_coherency = 0;
422                         break;
423                 }
424                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
425         }
426 }
427
428 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
429 {
430         struct dmar_drhd_unit *drhd = NULL;
431         int i;
432
433         for_each_drhd_unit(drhd) {
434                 if (drhd->ignored)
435                         continue;
436
437                 for (i = 0; i < drhd->devices_cnt; i++)
438                         if (drhd->devices[i]->bus->number == bus &&
439                             drhd->devices[i]->devfn == devfn)
440                                 return drhd->iommu;
441
442                 if (drhd->include_all)
443                         return drhd->iommu;
444         }
445
446         return NULL;
447 }
448
449 static void domain_flush_cache(struct dmar_domain *domain,
450                                void *addr, int size)
451 {
452         if (!domain->iommu_coherency)
453                 clflush_cache_range(addr, size);
454 }
455
456 /* Gets context entry for a given bus and devfn */
457 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
458                 u8 bus, u8 devfn)
459 {
460         struct root_entry *root;
461         struct context_entry *context;
462         unsigned long phy_addr;
463         unsigned long flags;
464
465         spin_lock_irqsave(&iommu->lock, flags);
466         root = &iommu->root_entry[bus];
467         context = get_context_addr_from_root(root);
468         if (!context) {
469                 context = (struct context_entry *)alloc_pgtable_page();
470                 if (!context) {
471                         spin_unlock_irqrestore(&iommu->lock, flags);
472                         return NULL;
473                 }
474                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
475                 phy_addr = virt_to_phys((void *)context);
476                 set_root_value(root, phy_addr);
477                 set_root_present(root);
478                 __iommu_flush_cache(iommu, root, sizeof(*root));
479         }
480         spin_unlock_irqrestore(&iommu->lock, flags);
481         return &context[devfn];
482 }
483
484 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
485 {
486         struct root_entry *root;
487         struct context_entry *context;
488         int ret;
489         unsigned long flags;
490
491         spin_lock_irqsave(&iommu->lock, flags);
492         root = &iommu->root_entry[bus];
493         context = get_context_addr_from_root(root);
494         if (!context) {
495                 ret = 0;
496                 goto out;
497         }
498         ret = context_present(&context[devfn]);
499 out:
500         spin_unlock_irqrestore(&iommu->lock, flags);
501         return ret;
502 }
503
504 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
505 {
506         struct root_entry *root;
507         struct context_entry *context;
508         unsigned long flags;
509
510         spin_lock_irqsave(&iommu->lock, flags);
511         root = &iommu->root_entry[bus];
512         context = get_context_addr_from_root(root);
513         if (context) {
514                 context_clear_entry(&context[devfn]);
515                 __iommu_flush_cache(iommu, &context[devfn],
516                         sizeof(*context));
517         }
518         spin_unlock_irqrestore(&iommu->lock, flags);
519 }
520
521 static void free_context_table(struct intel_iommu *iommu)
522 {
523         struct root_entry *root;
524         int i;
525         unsigned long flags;
526         struct context_entry *context;
527
528         spin_lock_irqsave(&iommu->lock, flags);
529         if (!iommu->root_entry) {
530                 goto out;
531         }
532         for (i = 0; i < ROOT_ENTRY_NR; i++) {
533                 root = &iommu->root_entry[i];
534                 context = get_context_addr_from_root(root);
535                 if (context)
536                         free_pgtable_page(context);
537         }
538         free_pgtable_page(iommu->root_entry);
539         iommu->root_entry = NULL;
540 out:
541         spin_unlock_irqrestore(&iommu->lock, flags);
542 }
543
544 /* page table handling */
545 #define LEVEL_STRIDE            (9)
546 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
547
548 static inline int agaw_to_level(int agaw)
549 {
550         return agaw + 2;
551 }
552
553 static inline int agaw_to_width(int agaw)
554 {
555         return 30 + agaw * LEVEL_STRIDE;
556
557 }
558
559 static inline int width_to_agaw(int width)
560 {
561         return (width - 30) / LEVEL_STRIDE;
562 }
563
564 static inline unsigned int level_to_offset_bits(int level)
565 {
566         return (12 + (level - 1) * LEVEL_STRIDE);
567 }
568
569 static inline int address_level_offset(u64 addr, int level)
570 {
571         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
572 }
573
574 static inline u64 level_mask(int level)
575 {
576         return ((u64)-1 << level_to_offset_bits(level));
577 }
578
579 static inline u64 level_size(int level)
580 {
581         return ((u64)1 << level_to_offset_bits(level));
582 }
583
584 static inline u64 align_to_level(u64 addr, int level)
585 {
586         return ((addr + level_size(level) - 1) & level_mask(level));
587 }
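
/*
 * Worked example of the helpers above (editorial note): a domain with
 * agaw 2 has agaw_to_level(2) = 4 levels and covers agaw_to_width(2) =
 * 48 bits.  level_to_offset_bits() gives 12, 21, 30 and 39 for levels
 * 1..4, so a 48-bit address decodes as bits [47:39] -> level-4 index,
 * [38:30] -> level 3, [29:21] -> level 2, [20:12] -> level 1, with
 * [11:0] the offset inside a 4KB page.  align_to_level(addr, 2), for
 * instance, rounds addr up to the next 2MB (1 << 21) boundary.
 */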
588
589 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
590 {
591         int addr_width = agaw_to_width(domain->agaw);
592         struct dma_pte *parent, *pte = NULL;
593         int level = agaw_to_level(domain->agaw);
594         int offset;
595         unsigned long flags;
596
597         BUG_ON(!domain->pgd);
598
599         addr &= (((u64)1) << addr_width) - 1;
600         parent = domain->pgd;
601
602         spin_lock_irqsave(&domain->mapping_lock, flags);
603         while (level > 0) {
604                 void *tmp_page;
605
606                 offset = address_level_offset(addr, level);
607                 pte = &parent[offset];
608                 if (level == 1)
609                         break;
610
611                 if (!dma_pte_present(pte)) {
612                         tmp_page = alloc_pgtable_page();
613
614                         if (!tmp_page) {
615                                 spin_unlock_irqrestore(&domain->mapping_lock,
616                                         flags);
617                                 return NULL;
618                         }
619                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
620                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
621                         /*
622                          * high level table always sets r/w, last level page
623                          * table control read/write
624                          */
625                         dma_set_pte_readable(pte);
626                         dma_set_pte_writable(pte);
627                         domain_flush_cache(domain, pte, sizeof(*pte));
628                 }
629                 parent = phys_to_virt(dma_pte_addr(pte));
630                 level--;
631         }
632
633         spin_unlock_irqrestore(&domain->mapping_lock, flags);
634         return pte;
635 }
636
637 /* return the address's pte at a specific level */
638 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
639                 int level)
640 {
641         struct dma_pte *parent, *pte = NULL;
642         int total = agaw_to_level(domain->agaw);
643         int offset;
644
645         parent = domain->pgd;
646         while (level <= total) {
647                 offset = address_level_offset(addr, total);
648                 pte = &parent[offset];
649                 if (level == total)
650                         return pte;
651
652                 if (!dma_pte_present(pte))
653                         break;
654                 parent = phys_to_virt(dma_pte_addr(pte));
655                 total--;
656         }
657         return NULL;
658 }
659
660 /* clear one page's page table */
661 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
662 {
663         struct dma_pte *pte = NULL;
664
665         /* get last level pte */
666         pte = dma_addr_level_pte(domain, addr, 1);
667
668         if (pte) {
669                 dma_clear_pte(pte);
670                 domain_flush_cache(domain, pte, sizeof(*pte));
671         }
672 }
673
674 /* clear last level pte; a tlb flush should follow */
675 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
676 {
677         int addr_width = agaw_to_width(domain->agaw);
678
679         start &= (((u64)1) << addr_width) - 1;
680         end &= (((u64)1) << addr_width) - 1;
681         /* in case it's a partial page */
682         start = PAGE_ALIGN(start);
683         end &= PAGE_MASK;
684
685         /* we don't need a lock here; nobody else touches the iova range */
686         while (start < end) {
687                 dma_pte_clear_one(domain, start);
688                 start += VTD_PAGE_SIZE;
689         }
690 }
691
692 /* free page table pages. last level pte should already be cleared */
693 static void dma_pte_free_pagetable(struct dmar_domain *domain,
694         u64 start, u64 end)
695 {
696         int addr_width = agaw_to_width(domain->agaw);
697         struct dma_pte *pte;
698         int total = agaw_to_level(domain->agaw);
699         int level;
700         u64 tmp;
701
702         start &= (((u64)1) << addr_width) - 1;
703         end &= (((u64)1) << addr_width) - 1;
704
705         /* we don't need a lock here; nobody else touches the iova range */
706         level = 2;
707         while (level <= total) {
708                 tmp = align_to_level(start, level);
709                 if (tmp >= end || (tmp + level_size(level) > end))
710                         return;
711
712                 while (tmp < end) {
713                         pte = dma_addr_level_pte(domain, tmp, level);
714                         if (pte) {
715                                 free_pgtable_page(
716                                         phys_to_virt(dma_pte_addr(pte)));
717                                 dma_clear_pte(pte);
718                                 domain_flush_cache(domain, pte, sizeof(*pte));
719                         }
720                         tmp += level_size(level);
721                 }
722                 level++;
723         }
724         /* free pgd */
725         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
726                 free_pgtable_page(domain->pgd);
727                 domain->pgd = NULL;
728         }
729 }
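
/*
 * Editorial note: dma_pte_free_pagetable() works bottom-up.  At level 2
 * it frees the (already cleared) level-1 tables whose whole range
 * [tmp, tmp + 2MB) lies inside [start, end), then repeats one level up
 * in 1GB steps, and so on; the top-level pgd itself is only released
 * when the range covers the domain's entire address width.
 */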
730
731 /* iommu handling */
732 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
733 {
734         struct root_entry *root;
735         unsigned long flags;
736
737         root = (struct root_entry *)alloc_pgtable_page();
738         if (!root)
739                 return -ENOMEM;
740
741         __iommu_flush_cache(iommu, root, ROOT_SIZE);
742
743         spin_lock_irqsave(&iommu->lock, flags);
744         iommu->root_entry = root;
745         spin_unlock_irqrestore(&iommu->lock, flags);
746
747         return 0;
748 }
749
750 static void iommu_set_root_entry(struct intel_iommu *iommu)
751 {
752         void *addr;
753         u32 cmd, sts;
754         unsigned long flag;
755
756         addr = iommu->root_entry;
757
758         spin_lock_irqsave(&iommu->register_lock, flag);
759         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
760
761         cmd = iommu->gcmd | DMA_GCMD_SRTP;
762         writel(cmd, iommu->reg + DMAR_GCMD_REG);
763
764         /* Make sure hardware completes it */
765         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
766                 readl, (sts & DMA_GSTS_RTPS), sts);
767
768         spin_unlock_irqrestore(&iommu->register_lock, flag);
769 }
770
771 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
772 {
773         u32 val;
774         unsigned long flag;
775
776         if (!cap_rwbf(iommu->cap))
777                 return;
778         val = iommu->gcmd | DMA_GCMD_WBF;
779
780         spin_lock_irqsave(&iommu->register_lock, flag);
781         writel(val, iommu->reg + DMAR_GCMD_REG);
782
783         /* Make sure hardware completes it */
784         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
785                         readl, (!(val & DMA_GSTS_WBFS)), val);
786
787         spin_unlock_irqrestore(&iommu->register_lock, flag);
788 }
789
790 /* return value determines whether we need a write buffer flush */
791 static int __iommu_flush_context(struct intel_iommu *iommu,
792         u16 did, u16 source_id, u8 function_mask, u64 type,
793         int non_present_entry_flush)
794 {
795         u64 val = 0;
796         unsigned long flag;
797
798         /*
799          * In the non-present entry flush case, if hardware doesn't cache
800          * non-present entries we do nothing; if hardware does cache them,
801          * we flush entries of domain 0 (that domain id is used to cache
802          * any non-present entries)
803          */
804         if (non_present_entry_flush) {
805                 if (!cap_caching_mode(iommu->cap))
806                         return 1;
807                 else
808                         did = 0;
809         }
810
811         switch (type) {
812         case DMA_CCMD_GLOBAL_INVL:
813                 val = DMA_CCMD_GLOBAL_INVL;
814                 break;
815         case DMA_CCMD_DOMAIN_INVL:
816                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
817                 break;
818         case DMA_CCMD_DEVICE_INVL:
819                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
820                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
821                 break;
822         default:
823                 BUG();
824         }
825         val |= DMA_CCMD_ICC;
826
827         spin_lock_irqsave(&iommu->register_lock, flag);
828         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
829
830         /* Make sure hardware completes it */
831         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
832                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
833
834         spin_unlock_irqrestore(&iommu->register_lock, flag);
835
836         /* flushing a context entry will implicitly flush the write buffer */
837         return 0;
838 }
839
840 /* return value determines whether we need a write buffer flush */
841 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
842         u64 addr, unsigned int size_order, u64 type,
843         int non_present_entry_flush)
844 {
845         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
846         u64 val = 0, val_iva = 0;
847         unsigned long flag;
848
849         /*
850          * In the non-present entry flush case, if hardware doesn't cache
851          * non-present entries we do nothing; if hardware does cache them,
852          * we flush entries of domain 0 (that domain id is used to cache
853          * any non-present entries)
854          */
855         if (non_present_entry_flush) {
856                 if (!cap_caching_mode(iommu->cap))
857                         return 1;
858                 else
859                         did = 0;
860         }
861
862         switch (type) {
863         case DMA_TLB_GLOBAL_FLUSH:
864                 /* global flush doesn't need set IVA_REG */
865                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
866                 break;
867         case DMA_TLB_DSI_FLUSH:
868                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
869                 break;
870         case DMA_TLB_PSI_FLUSH:
871                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
872                 /* Note: always flush non-leaf currently */
873                 val_iva = size_order | addr;
874                 break;
875         default:
876                 BUG();
877         }
878         /* Note: set drain read/write */
879 #if 0
880         /*
881          * This is probably just to be extra safe.  Looks like we can
882          * ignore it without any impact.
883          */
884         if (cap_read_drain(iommu->cap))
885                 val |= DMA_TLB_READ_DRAIN;
886 #endif
887         if (cap_write_drain(iommu->cap))
888                 val |= DMA_TLB_WRITE_DRAIN;
889
890         spin_lock_irqsave(&iommu->register_lock, flag);
891         /* Note: Only uses first TLB reg currently */
892         if (val_iva)
893                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
894         dmar_writeq(iommu->reg + tlb_offset + 8, val);
895
896         /* Make sure hardware completes it */
897         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
898                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
899
900         spin_unlock_irqrestore(&iommu->register_lock, flag);
901
902         /* check IOTLB invalidation granularity */
903         if (DMA_TLB_IAIG(val) == 0)
904                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
905         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
906                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
907                         (unsigned long long)DMA_TLB_IIRG(type),
908                         (unsigned long long)DMA_TLB_IAIG(val));
909         /* flushing an iotlb entry will implicitly flush the write buffer */
910         return 0;
911 }
912
913 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
914         u64 addr, unsigned int pages, int non_present_entry_flush)
915 {
916         unsigned int mask;
917
918         BUG_ON(addr & (~VTD_PAGE_MASK));
919         BUG_ON(pages == 0);
920
921         /* Fall back to domain-selective flush if there is no PSI support */
922         if (!cap_pgsel_inv(iommu->cap))
923                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
924                                                 DMA_TLB_DSI_FLUSH,
925                                                 non_present_entry_flush);
926
927         /*
928          * PSI requires page size to be 2 ^ x, and the base address is naturally
929          * aligned to the size
930          */
931         mask = ilog2(__roundup_pow_of_two(pages));
932         /* Fall back to domain-selective flush if the size is too big */
933         if (mask > cap_max_amask_val(iommu->cap))
934                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
935                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
936
937         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
938                                         DMA_TLB_PSI_FLUSH,
939                                         non_present_entry_flush);
940 }
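
/*
 * Worked example (editorial note): a request for pages = 5 is rounded
 * up to 8, so mask = ilog2(8) = 3 and the hardware invalidates the
 * naturally aligned 8-page (32KB) region containing addr.  If that mask
 * exceeds cap_max_amask_val(), or page-selective invalidation is not
 * supported at all, the code above falls back to a domain-selective
 * flush instead.
 */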
941
942 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
943 {
944         u32 pmen;
945         unsigned long flags;
946
947         spin_lock_irqsave(&iommu->register_lock, flags);
948         pmen = readl(iommu->reg + DMAR_PMEN_REG);
949         pmen &= ~DMA_PMEN_EPM;
950         writel(pmen, iommu->reg + DMAR_PMEN_REG);
951
952         /* wait for the protected region status bit to clear */
953         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
954                 readl, !(pmen & DMA_PMEN_PRS), pmen);
955
956         spin_unlock_irqrestore(&iommu->register_lock, flags);
957 }
958
959 static int iommu_enable_translation(struct intel_iommu *iommu)
960 {
961         u32 sts;
962         unsigned long flags;
963
964         spin_lock_irqsave(&iommu->register_lock, flags);
965         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
966
967         /* Make sure hardware completes it */
968         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
969                 readl, (sts & DMA_GSTS_TES), sts);
970
971         iommu->gcmd |= DMA_GCMD_TE;
972         spin_unlock_irqrestore(&iommu->register_lock, flags);
973         return 0;
974 }
975
976 static int iommu_disable_translation(struct intel_iommu *iommu)
977 {
978         u32 sts;
979         unsigned long flag;
980
981         spin_lock_irqsave(&iommu->register_lock, flag);
982         iommu->gcmd &= ~DMA_GCMD_TE;
983         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
984
985         /* Make sure hardware completes it */
986         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
987                 readl, (!(sts & DMA_GSTS_TES)), sts);
988
989         spin_unlock_irqrestore(&iommu->register_lock, flag);
990         return 0;
991 }
992
993 /* iommu interrupt handling. Most of this is MSI-like. */
994
995 static const char *fault_reason_strings[] =
996 {
997         "Software",
998         "Present bit in root entry is clear",
999         "Present bit in context entry is clear",
1000         "Invalid context entry",
1001         "Access beyond MGAW",
1002         "PTE Write access is not set",
1003         "PTE Read access is not set",
1004         "Next page table ptr is invalid",
1005         "Root table address invalid",
1006         "Context table ptr is invalid",
1007         "non-zero reserved fields in RTP",
1008         "non-zero reserved fields in CTP",
1009         "non-zero reserved fields in PTE",
1010 };
1011 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1012
1013 const char *dmar_get_fault_reason(u8 fault_reason)
1014 {
1015         if (fault_reason > MAX_FAULT_REASON_IDX)
1016                 return "Unknown";
1017         else
1018                 return fault_reason_strings[fault_reason];
1019 }
1020
1021 void dmar_msi_unmask(unsigned int irq)
1022 {
1023         struct intel_iommu *iommu = get_irq_data(irq);
1024         unsigned long flag;
1025
1026         /* unmask it */
1027         spin_lock_irqsave(&iommu->register_lock, flag);
1028         writel(0, iommu->reg + DMAR_FECTL_REG);
1029         /* Read a reg to force-flush the posted write */
1030         readl(iommu->reg + DMAR_FECTL_REG);
1031         spin_unlock_irqrestore(&iommu->register_lock, flag);
1032 }
1033
1034 void dmar_msi_mask(unsigned int irq)
1035 {
1036         unsigned long flag;
1037         struct intel_iommu *iommu = get_irq_data(irq);
1038
1039         /* mask it */
1040         spin_lock_irqsave(&iommu->register_lock, flag);
1041         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1042         /* Read a reg to force-flush the posted write */
1043         readl(iommu->reg + DMAR_FECTL_REG);
1044         spin_unlock_irqrestore(&iommu->register_lock, flag);
1045 }
1046
1047 void dmar_msi_write(int irq, struct msi_msg *msg)
1048 {
1049         struct intel_iommu *iommu = get_irq_data(irq);
1050         unsigned long flag;
1051
1052         spin_lock_irqsave(&iommu->register_lock, flag);
1053         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1054         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1055         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1056         spin_unlock_irqrestore(&iommu->register_lock, flag);
1057 }
1058
1059 void dmar_msi_read(int irq, struct msi_msg *msg)
1060 {
1061         struct intel_iommu *iommu = get_irq_data(irq);
1062         unsigned long flag;
1063
1064         spin_lock_irqsave(&iommu->register_lock, flag);
1065         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1066         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1067         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1068         spin_unlock_irqrestore(&iommu->register_lock, flag);
1069 }
1070
1071 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1072                 u8 fault_reason, u16 source_id, unsigned long long addr)
1073 {
1074         const char *reason;
1075
1076         reason = dmar_get_fault_reason(fault_reason);
1077
1078         printk(KERN_ERR
1079                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1080                 "fault addr %llx \n"
1081                 "DMAR:[fault reason %02d] %s\n",
1082                 (type ? "DMA Read" : "DMA Write"),
1083                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1084                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1085         return 0;
1086 }
1087
1088 #define PRIMARY_FAULT_REG_LEN (16)
1089 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1090 {
1091         struct intel_iommu *iommu = dev_id;
1092         int reg, fault_index;
1093         u32 fault_status;
1094         unsigned long flag;
1095
1096         spin_lock_irqsave(&iommu->register_lock, flag);
1097         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1098
1099         /* TBD: ignore advanced fault log currently */
1100         if (!(fault_status & DMA_FSTS_PPF))
1101                 goto clear_overflow;
1102
1103         fault_index = dma_fsts_fault_record_index(fault_status);
1104         reg = cap_fault_reg_offset(iommu->cap);
1105         while (1) {
1106                 u8 fault_reason;
1107                 u16 source_id;
1108                 u64 guest_addr;
1109                 int type;
1110                 u32 data;
1111
1112                 /* highest 32 bits */
1113                 data = readl(iommu->reg + reg +
1114                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1115                 if (!(data & DMA_FRCD_F))
1116                         break;
1117
1118                 fault_reason = dma_frcd_fault_reason(data);
1119                 type = dma_frcd_type(data);
1120
1121                 data = readl(iommu->reg + reg +
1122                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1123                 source_id = dma_frcd_source_id(data);
1124
1125                 guest_addr = dmar_readq(iommu->reg + reg +
1126                                 fault_index * PRIMARY_FAULT_REG_LEN);
1127                 guest_addr = dma_frcd_page_addr(guest_addr);
1128                 /* clear the fault */
1129                 writel(DMA_FRCD_F, iommu->reg + reg +
1130                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1131
1132                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1133
1134                 iommu_page_fault_do_one(iommu, type, fault_reason,
1135                                 source_id, guest_addr);
1136
1137                 fault_index++;
1138                 if (fault_index > cap_num_fault_regs(iommu->cap))
1139                         fault_index = 0;
1140                 spin_lock_irqsave(&iommu->register_lock, flag);
1141         }
1142 clear_overflow:
1143         /* clear primary fault overflow */
1144         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1145         if (fault_status & DMA_FSTS_PFO)
1146                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1147
1148         spin_unlock_irqrestore(&iommu->register_lock, flag);
1149         return IRQ_HANDLED;
1150 }
1151
1152 int dmar_set_interrupt(struct intel_iommu *iommu)
1153 {
1154         int irq, ret;
1155
1156         irq = create_irq();
1157         if (!irq) {
1158                 printk(KERN_ERR "IOMMU: no free vectors\n");
1159                 return -EINVAL;
1160         }
1161
1162         set_irq_data(irq, iommu);
1163         iommu->irq = irq;
1164
1165         ret = arch_setup_dmar_msi(irq);
1166         if (ret) {
1167                 set_irq_data(irq, NULL);
1168                 iommu->irq = 0;
1169                 destroy_irq(irq);
1170                 return 0;
1171         }
1172
1173         /* Force any pending faults to be cleared */
1174         iommu_page_fault(irq, iommu);
1175
1176         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1177         if (ret)
1178                 printk(KERN_ERR "IOMMU: can't request irq\n");
1179         return ret;
1180 }
1181
1182 static int iommu_init_domains(struct intel_iommu *iommu)
1183 {
1184         unsigned long ndomains;
1185         unsigned long nlongs;
1186
1187         ndomains = cap_ndoms(iommu->cap);
1188         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1189         nlongs = BITS_TO_LONGS(ndomains);
1190
1191         /* TBD: there might be 64K domains,
1192          * consider a different allocation scheme for future chips
1193          */
1194         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1195         if (!iommu->domain_ids) {
1196                 printk(KERN_ERR "Allocating domain id array failed\n");
1197                 return -ENOMEM;
1198         }
1199         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1200                         GFP_KERNEL);
1201         if (!iommu->domains) {
1202                 printk(KERN_ERR "Allocating domain array failed\n");
1203                 kfree(iommu->domain_ids);
1204                 return -ENOMEM;
1205         }
1206
1207         spin_lock_init(&iommu->lock);
1208
1209         /*
1210          * if Caching mode is set, then invalid translations are tagged
1211          * with domain id 0. Hence we need to pre-allocate it.
1212          */
1213         if (cap_caching_mode(iommu->cap))
1214                 set_bit(0, iommu->domain_ids);
1215         return 0;
1216 }
1217
1218
1219 static void domain_exit(struct dmar_domain *domain);
1220 static void vm_domain_exit(struct dmar_domain *domain);
1221
1222 void free_dmar_iommu(struct intel_iommu *iommu)
1223 {
1224         struct dmar_domain *domain;
1225         int i;
1226         unsigned long flags;
1227
1228         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1229         for (; i < cap_ndoms(iommu->cap); ) {
1230                 domain = iommu->domains[i];
1231                 clear_bit(i, iommu->domain_ids);
1232
1233                 spin_lock_irqsave(&domain->iommu_lock, flags);
1234                 if (--domain->iommu_count == 0) {
1235                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1236                                 vm_domain_exit(domain);
1237                         else
1238                                 domain_exit(domain);
1239                 }
1240                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1241
1242                 i = find_next_bit(iommu->domain_ids,
1243                         cap_ndoms(iommu->cap), i+1);
1244         }
1245
1246         if (iommu->gcmd & DMA_GCMD_TE)
1247                 iommu_disable_translation(iommu);
1248
1249         if (iommu->irq) {
1250                 set_irq_data(iommu->irq, NULL);
1251                 /* This will mask the irq */
1252                 free_irq(iommu->irq, iommu);
1253                 destroy_irq(iommu->irq);
1254         }
1255
1256         kfree(iommu->domains);
1257         kfree(iommu->domain_ids);
1258
1259         g_iommus[iommu->seq_id] = NULL;
1260
1261         /* if all iommus are freed, free g_iommus */
1262         for (i = 0; i < g_num_of_iommus; i++) {
1263                 if (g_iommus[i])
1264                         break;
1265         }
1266
1267         if (i == g_num_of_iommus)
1268                 kfree(g_iommus);
1269
1270         /* free context mapping */
1271         free_context_table(iommu);
1272 }
1273
1274 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1275 {
1276         unsigned long num;
1277         unsigned long ndomains;
1278         struct dmar_domain *domain;
1279         unsigned long flags;
1280
1281         domain = alloc_domain_mem();
1282         if (!domain)
1283                 return NULL;
1284
1285         ndomains = cap_ndoms(iommu->cap);
1286
1287         spin_lock_irqsave(&iommu->lock, flags);
1288         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1289         if (num >= ndomains) {
1290                 spin_unlock_irqrestore(&iommu->lock, flags);
1291                 free_domain_mem(domain);
1292                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1293                 return NULL;
1294         }
1295
1296         set_bit(num, iommu->domain_ids);
1297         domain->id = num;
1298         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1299         set_bit(iommu->seq_id, &domain->iommu_bmp);
1300         domain->flags = 0;
1301         iommu->domains[num] = domain;
1302         spin_unlock_irqrestore(&iommu->lock, flags);
1303
1304         return domain;
1305 }
1306
1307 static void iommu_free_domain(struct dmar_domain *domain)
1308 {
1309         unsigned long flags;
1310         struct intel_iommu *iommu;
1311
1312         iommu = domain_get_iommu(domain);
1313
1314         spin_lock_irqsave(&iommu->lock, flags);
1315         clear_bit(domain->id, iommu->domain_ids);
1316         spin_unlock_irqrestore(&iommu->lock, flags);
1317 }
1318
1319 static struct iova_domain reserved_iova_list;
1320 static struct lock_class_key reserved_alloc_key;
1321 static struct lock_class_key reserved_rbtree_key;
1322
1323 static void dmar_init_reserved_ranges(void)
1324 {
1325         struct pci_dev *pdev = NULL;
1326         struct iova *iova;
1327         int i;
1328         u64 addr, size;
1329
1330         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1331
1332         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1333                 &reserved_alloc_key);
1334         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1335                 &reserved_rbtree_key);
1336
1337         /* IOAPIC ranges shouldn't be accessed by DMA */
1338         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1339                 IOVA_PFN(IOAPIC_RANGE_END));
1340         if (!iova)
1341                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1342
1343         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1344         for_each_pci_dev(pdev) {
1345                 struct resource *r;
1346
1347                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1348                         r = &pdev->resource[i];
1349                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1350                                 continue;
1351                         addr = r->start;
1352                         addr &= PAGE_MASK;
1353                         size = r->end - addr;
1354                         size = PAGE_ALIGN(size);
1355                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1356                                 IOVA_PFN(size + addr) - 1);
1357                         if (!iova)
1358                                 printk(KERN_ERR "Reserve iova failed\n");
1359                 }
1360         }
1361
1362 }
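
/*
 * Editorial note: the ranges reserved above are copied into every
 * domain's iova allocator (see domain_reserve_special_ranges() below),
 * so DMA addresses handed out to a device can never overlap the IOAPIC
 * MMIO window or any PCI BAR, which avoids stray peer-to-peer decoding
 * of IOVAs.
 */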
1363
1364 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1365 {
1366         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1367 }
1368
1369 static inline int guestwidth_to_adjustwidth(int gaw)
1370 {
1371         int agaw;
1372         int r = (gaw - 12) % 9;
1373
1374         if (r == 0)
1375                 agaw = gaw;
1376         else
1377                 agaw = gaw + 9 - r;
1378         if (agaw > 64)
1379                 agaw = 64;
1380         return agaw;
1381 }
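
/*
 * Worked example (editorial note): the adjusted width rounds the guest
 * width up so that (width - 12) is a multiple of 9, matching the
 * 9-bit-per-level page tables.  gaw = 48 gives r = 0 and stays 48;
 * gaw = 35 gives r = (35 - 12) % 9 = 5, so agaw = 35 + 9 - 5 = 39;
 * anything above 64 is clamped to 64.
 */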
1382
1383 static int domain_init(struct dmar_domain *domain, int guest_width)
1384 {
1385         struct intel_iommu *iommu;
1386         int adjust_width, agaw;
1387         unsigned long sagaw;
1388
1389         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1390         spin_lock_init(&domain->mapping_lock);
1391         spin_lock_init(&domain->iommu_lock);
1392
1393         domain_reserve_special_ranges(domain);
1394
1395         /* calculate AGAW */
1396         iommu = domain_get_iommu(domain);
1397         if (guest_width > cap_mgaw(iommu->cap))
1398                 guest_width = cap_mgaw(iommu->cap);
1399         domain->gaw = guest_width;
1400         adjust_width = guestwidth_to_adjustwidth(guest_width);
1401         agaw = width_to_agaw(adjust_width);
1402         sagaw = cap_sagaw(iommu->cap);
1403         if (!test_bit(agaw, &sagaw)) {
1404                 /* hardware doesn't support it, choose a bigger one */
1405                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1406                 agaw = find_next_bit(&sagaw, 5, agaw);
1407                 if (agaw >= 5)
1408                         return -ENODEV;
1409         }
1410         domain->agaw = agaw;
1411         INIT_LIST_HEAD(&domain->devices);
1412
1413         if (ecap_coherent(iommu->ecap))
1414                 domain->iommu_coherency = 1;
1415         else
1416                 domain->iommu_coherency = 0;
1417
1418         domain->iommu_count = 1;
1419
1420         /* always allocate the top pgd */
1421         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1422         if (!domain->pgd)
1423                 return -ENOMEM;
1424         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1425         return 0;
1426 }
1427
1428 static void domain_exit(struct dmar_domain *domain)
1429 {
1430         u64 end;
1431
1432         /* Domain 0 is reserved, so don't process it */
1433         if (!domain)
1434                 return;
1435
1436         domain_remove_dev_info(domain);
1437         /* destroy iovas */
1438         put_iova_domain(&domain->iovad);
1439         end = DOMAIN_MAX_ADDR(domain->gaw);
1440         end = end & (~PAGE_MASK);
1441
1442         /* clear ptes */
1443         dma_pte_clear_range(domain, 0, end);
1444
1445         /* free page tables */
1446         dma_pte_free_pagetable(domain, 0, end);
1447
1448         iommu_free_domain(domain);
1449         free_domain_mem(domain);
1450 }
1451
1452 static int domain_context_mapping_one(struct dmar_domain *domain,
1453                 u8 bus, u8 devfn)
1454 {
1455         struct context_entry *context;
1456         unsigned long flags;
1457         struct intel_iommu *iommu;
1458         struct dma_pte *pgd;
1459         unsigned long num;
1460         unsigned long ndomains;
1461         int id;
1462         int agaw;
1463
1464         pr_debug("Set context mapping for %02x:%02x.%d\n",
1465                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1466         BUG_ON(!domain->pgd);
1467
1468         iommu = device_to_iommu(bus, devfn);
1469         if (!iommu)
1470                 return -ENODEV;
1471
1472         context = device_to_context_entry(iommu, bus, devfn);
1473         if (!context)
1474                 return -ENOMEM;
1475         spin_lock_irqsave(&iommu->lock, flags);
1476         if (context_present(context)) {
1477                 spin_unlock_irqrestore(&iommu->lock, flags);
1478                 return 0;
1479         }
1480
1481         id = domain->id;
1482         pgd = domain->pgd;
1483
1484         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1485                 int found = 0;
1486
1487                 /* find an available domain id for this device in iommu */
1488                 ndomains = cap_ndoms(iommu->cap);
1489                 num = find_first_bit(iommu->domain_ids, ndomains);
1490                 for (; num < ndomains; ) {
1491                         if (iommu->domains[num] == domain) {
1492                                 id = num;
1493                                 found = 1;
1494                                 break;
1495                         }
1496                         num = find_next_bit(iommu->domain_ids,
1497                                             cap_ndoms(iommu->cap), num+1);
1498                 }
1499
1500                 if (found == 0) {
1501                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1502                         if (num >= ndomains) {
1503                                 spin_unlock_irqrestore(&iommu->lock, flags);
1504                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1505                                 return -EFAULT;
1506                         }
1507
1508                         set_bit(num, iommu->domain_ids);
1509                         iommu->domains[num] = domain;
1510                         id = num;
1511                 }
1512
1513                 /* Skip top levels of page tables for
1514                  * iommus that have a smaller agaw than the default.
1515                  */
1516                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1517                         pgd = phys_to_virt(dma_pte_addr(pgd));
1518                         if (!dma_pte_present(pgd)) {
1519                                 spin_unlock_irqrestore(&iommu->lock, flags);
1520                                 return -ENOMEM;
1521                         }
1522                 }
1523         }
1524
1525         context_set_domain_id(context, id);
1526         context_set_address_width(context, iommu->agaw);
1527         context_set_address_root(context, virt_to_phys(pgd));
1528         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1529         context_set_fault_enable(context);
1530         context_set_present(context);
1531         domain_flush_cache(domain, context, sizeof(*context));
1532
1533         /* it's a non-present to present mapping */
1534         if (iommu->flush.flush_context(iommu, domain->id,
1535                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1536                 DMA_CCMD_DEVICE_INVL, 1))
1537                 iommu_flush_write_buffer(iommu);
1538         else
1539                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1540
1541         spin_unlock_irqrestore(&iommu->lock, flags);
1542
1543         spin_lock_irqsave(&domain->iommu_lock, flags);
1544         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1545                 domain->iommu_count++;
1546                 domain_update_iommu_coherency(domain);
1547         }
1548         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1549         return 0;
1550 }
1551
1552 static int
1553 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1554 {
1555         int ret;
1556         struct pci_dev *tmp, *parent;
1557
1558         ret = domain_context_mapping_one(domain, pdev->bus->number,
1559                 pdev->devfn);
1560         if (ret)
1561                 return ret;
1562
1563         /* dependent device mapping */
1564         tmp = pci_find_upstream_pcie_bridge(pdev);
1565         if (!tmp)
1566                 return 0;
1567         /* Secondary interface's bus number and devfn 0 */
1568         parent = pdev->bus->self;
1569         while (parent != tmp) {
1570                 ret = domain_context_mapping_one(domain, parent->bus->number,
1571                         parent->devfn);
1572                 if (ret)
1573                         return ret;
1574                 parent = parent->bus->self;
1575         }
1576         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1577                 return domain_context_mapping_one(domain,
1578                         tmp->subordinate->number, 0);
1579         else /* this is a legacy PCI bridge */
1580                 return domain_context_mapping_one(domain,
1581                         tmp->bus->number, tmp->devfn);
1582 }
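
/*
 * Editorial note: the walk above also programs a context entry for
 * every bridge between the device and its upstream PCIe-to-PCI bridge.
 * When that bridge is a PCIe-to-PCI bridge the final entry is keyed on
 * its secondary bus number with devfn 0, and for a legacy PCI bridge on
 * the bridge's own bus/devfn, presumably because DMA from devices
 * behind such bridges reaches the IOMMU tagged with the bridge's
 * requester ID rather than the device's own.
 */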
1583
1584 static int domain_context_mapped(struct pci_dev *pdev)
1585 {
1586         int ret;
1587         struct pci_dev *tmp, *parent;
1588         struct intel_iommu *iommu;
1589
1590         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1591         if (!iommu)
1592                 return -ENODEV;
1593
1594         ret = device_context_mapped(iommu,
1595                 pdev->bus->number, pdev->devfn);
1596         if (!ret)
1597                 return ret;
1598         /* dependent device mapping */
1599         tmp = pci_find_upstream_pcie_bridge(pdev);
1600         if (!tmp)
1601                 return ret;
1602         /* Secondary interface's bus number and devfn 0 */
1603         parent = pdev->bus->self;
1604         while (parent != tmp) {
1605                 ret = device_context_mapped(iommu, parent->bus->number,
1606                         parent->devfn);
1607                 if (!ret)
1608                         return ret;
1609                 parent = parent->bus->self;
1610         }
1611         if (tmp->is_pcie)
1612                 return device_context_mapped(iommu,
1613                         tmp->subordinate->number, 0);
1614         else
1615                 return device_context_mapped(iommu,
1616                         tmp->bus->number, tmp->devfn);
1617 }
1618
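/*
 * Map the IOVA range [iova, iova + size) to the host physical range starting
 * at @hpa, one 4KiB page at a time.  The caller must ensure the range is not
 * already mapped (the BUG_ON below fires on an existing PTE) and that nobody
 * else touches this IOVA range concurrently.
 */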
1619 static int
1620 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1621                         u64 hpa, size_t size, int prot)
1622 {
1623         u64 start_pfn, end_pfn;
1624         struct dma_pte *pte;
1625         int index;
1626         int addr_width = agaw_to_width(domain->agaw);
1627
1628         hpa &= (((u64)1) << addr_width) - 1;
1629
1630         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1631                 return -EINVAL;
1632         iova &= PAGE_MASK;
1633         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1634         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1635         index = 0;
1636         while (start_pfn < end_pfn) {
1637                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1638                 if (!pte)
1639                         return -ENOMEM;
1640                 /* We don't need lock here, nobody else
1641                  * touches the iova range
1642                  */
1643                 BUG_ON(dma_pte_addr(pte));
1644                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1645                 dma_set_pte_prot(pte, prot);
1646                 domain_flush_cache(domain, pte, sizeof(*pte));
1647                 start_pfn++;
1648                 index++;
1649         }
1650         return 0;
1651 }
1652
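/*
 * Tear down the context entry for (bus, devfn) on @iommu and invalidate the
 * context cache and IOTLB globally so the hardware stops using the old
 * translation.
 */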
1653 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1654 {
1655         if (!iommu)
1656                 return;
1657
1658         clear_context_table(iommu, bus, devfn);
1659         iommu->flush.flush_context(iommu, 0, 0, 0,
1660                                            DMA_CCMD_GLOBAL_INVL, 0);
1661         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1662                                          DMA_TLB_GLOBAL_FLUSH, 0);
1663 }
1664
1665 static void domain_remove_dev_info(struct dmar_domain *domain)
1666 {
1667         struct device_domain_info *info;
1668         unsigned long flags;
1669         struct intel_iommu *iommu;
1670
1671         spin_lock_irqsave(&device_domain_lock, flags);
1672         while (!list_empty(&domain->devices)) {
1673                 info = list_entry(domain->devices.next,
1674                         struct device_domain_info, link);
1675                 list_del(&info->link);
1676                 list_del(&info->global);
1677                 if (info->dev)
1678                         info->dev->dev.archdata.iommu = NULL;
1679                 spin_unlock_irqrestore(&device_domain_lock, flags);
1680
1681                 iommu = device_to_iommu(info->bus, info->devfn);
1682                 iommu_detach_dev(iommu, info->bus, info->devfn);
1683                 free_devinfo_mem(info);
1684
1685                 spin_lock_irqsave(&device_domain_lock, flags);
1686         }
1687         spin_unlock_irqrestore(&device_domain_lock, flags);
1688 }
1689
1690 /*
1691  * find_domain
1692  * Note: we use struct pci_dev->dev.archdata.iommu to store the domain info
1693  */
1694 static struct dmar_domain *
1695 find_domain(struct pci_dev *pdev)
1696 {
1697         struct device_domain_info *info;
1698
1699         /* No lock here; we assume the domain does not go away in the normal case */
1700         info = pdev->dev.archdata.iommu;
1701         if (info)
1702                 return info->domain;
1703         return NULL;
1704 }
1705
1706 /* find or allocate an initialized dmar_domain for the given device */
1707 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1708 {
1709         struct dmar_domain *domain, *found = NULL;
1710         struct intel_iommu *iommu;
1711         struct dmar_drhd_unit *drhd;
1712         struct device_domain_info *info, *tmp;
1713         struct pci_dev *dev_tmp;
1714         unsigned long flags;
1715         int bus = 0, devfn = 0;
1716
1717         domain = find_domain(pdev);
1718         if (domain)
1719                 return domain;
1720
1721         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1722         if (dev_tmp) {
1723                 if (dev_tmp->is_pcie) {
1724                         bus = dev_tmp->subordinate->number;
1725                         devfn = 0;
1726                 } else {
1727                         bus = dev_tmp->bus->number;
1728                         devfn = dev_tmp->devfn;
1729                 }
1730                 spin_lock_irqsave(&device_domain_lock, flags);
1731                 list_for_each_entry(info, &device_domain_list, global) {
1732                         if (info->bus == bus && info->devfn == devfn) {
1733                                 found = info->domain;
1734                                 break;
1735                         }
1736                 }
1737                 spin_unlock_irqrestore(&device_domain_lock, flags);
1738                 /* pcie-pci bridge already has a domain, use it */
1739                 if (found) {
1740                         domain = found;
1741                         goto found_domain;
1742                 }
1743         }
1744
1745         /* Allocate new domain for the device */
1746         drhd = dmar_find_matched_drhd_unit(pdev);
1747         if (!drhd) {
1748                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1749                         pci_name(pdev));
1750                 return NULL;
1751         }
1752         iommu = drhd->iommu;
1753
1754         domain = iommu_alloc_domain(iommu);
1755         if (!domain)
1756                 goto error;
1757
1758         if (domain_init(domain, gaw)) {
1759                 domain_exit(domain);
1760                 goto error;
1761         }
1762
1763         /* register pcie-to-pci device */
1764         if (dev_tmp) {
1765                 info = alloc_devinfo_mem();
1766                 if (!info) {
1767                         domain_exit(domain);
1768                         goto error;
1769                 }
1770                 info->bus = bus;
1771                 info->devfn = devfn;
1772                 info->dev = NULL;
1773                 info->domain = domain;
1774                 /* This domain is shared by devices under p2p bridge */
1775                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1776
1777                 /* pcie-to-pci bridge already has a domain, use it */
1778                 found = NULL;
1779                 spin_lock_irqsave(&device_domain_lock, flags);
1780                 list_for_each_entry(tmp, &device_domain_list, global) {
1781                         if (tmp->bus == bus && tmp->devfn == devfn) {
1782                                 found = tmp->domain;
1783                                 break;
1784                         }
1785                 }
1786                 if (found) {
1787                         free_devinfo_mem(info);
1788                         domain_exit(domain);
1789                         domain = found;
1790                 } else {
1791                         list_add(&info->link, &domain->devices);
1792                         list_add(&info->global, &device_domain_list);
1793                 }
1794                 spin_unlock_irqrestore(&device_domain_lock, flags);
1795         }
1796
1797 found_domain:
1798         info = alloc_devinfo_mem();
1799         if (!info)
1800                 goto error;
1801         info->bus = pdev->bus->number;
1802         info->devfn = pdev->devfn;
1803         info->dev = pdev;
1804         info->domain = domain;
1805         spin_lock_irqsave(&device_domain_lock, flags);
1806         /* somebody else raced us and already set up the domain */
1807         found = find_domain(pdev);
1808         if (found != NULL) {
1809                 spin_unlock_irqrestore(&device_domain_lock, flags);
1810                 if (found != domain) {
1811                         domain_exit(domain);
1812                         domain = found;
1813                 }
1814                 free_devinfo_mem(info);
1815                 return domain;
1816         }
1817         list_add(&info->link, &domain->devices);
1818         list_add(&info->global, &device_domain_list);
1819         pdev->dev.archdata.iommu = info;
1820         spin_unlock_irqrestore(&device_domain_lock, flags);
1821         return domain;
1822 error:
1823         /* recheck it here, maybe others set it */
1824         return find_domain(pdev);
1825 }
1826
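/*
 * Install a 1:1 (identity) mapping of [start, end) for @pdev, reserving the
 * IOVA range first so the address allocator never hands it out for regular
 * DMA.  Used for RMRR regions and for the graphics/ISA workarounds below.
 */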
1827 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1828                                       unsigned long long start,
1829                                       unsigned long long end)
1830 {
1831         struct dmar_domain *domain;
1832         unsigned long size;
1833         unsigned long long base;
1834         int ret;
1835
1836         printk(KERN_INFO
1837                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1838                 pci_name(pdev), start, end);
1839         /* page table init */
1840         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1841         if (!domain)
1842                 return -ENOMEM;
1843
1844         /* The address might not be aligned */
1845         base = start & PAGE_MASK;
1846         size = end - base;
1847         size = PAGE_ALIGN(size);
1848         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1849                         IOVA_PFN(base + size) - 1)) {
1850                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1851                 ret = -ENOMEM;
1852                 goto error;
1853         }
1854
1855         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1856                 size, base, pci_name(pdev));
1857         /*
1858          * RMRR range might have overlap with physical memory range,
1859          * clear it first
1860          */
1861         dma_pte_clear_range(domain, base, base + size);
1862
1863         ret = domain_page_mapping(domain, base, base, size,
1864                 DMA_PTE_READ|DMA_PTE_WRITE);
1865         if (ret)
1866                 goto error;
1867
1868         /* context entry init */
1869         ret = domain_context_mapping(domain, pdev);
1870         if (!ret)
1871                 return 0;
1872 error:
1873         domain_exit(domain);
1874         return ret;
1875
1876 }
1877
1878 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1879         struct pci_dev *pdev)
1880 {
1881         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1882                 return 0;
1883         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1884                 rmrr->end_address + 1);
1885 }
1886
1887 #ifdef CONFIG_DMAR_GFX_WA
1888 struct iommu_prepare_data {
1889         struct pci_dev *pdev;
1890         int ret;
1891 };
1892
1893 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1894                                          unsigned long end_pfn, void *datax)
1895 {
1896         struct iommu_prepare_data *data;
1897
1898         data = (struct iommu_prepare_data *)datax;
1899
1900         data->ret = iommu_prepare_identity_map(data->pdev,
1901                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1902         return data->ret;
1903
1904 }
1905
1906 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1907 {
1908         int nid;
1909         struct iommu_prepare_data data;
1910
1911         data.pdev = pdev;
1912         data.ret = 0;
1913
1914         for_each_online_node(nid) {
1915                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1916                 if (data.ret)
1917                         return data.ret;
1918         }
1919         return data.ret;
1920 }
1921
1922 static void __init iommu_prepare_gfx_mapping(void)
1923 {
1924         struct pci_dev *pdev = NULL;
1925         int ret;
1926
1927         for_each_pci_dev(pdev) {
1928                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1929                                 !IS_GFX_DEVICE(pdev))
1930                         continue;
1931                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1932                         pci_name(pdev));
1933                 ret = iommu_prepare_with_active_regions(pdev);
1934                 if (ret)
1935                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1936         }
1937 }
1938 #else /* !CONFIG_DMAR_GFX_WA */
1939 static inline void iommu_prepare_gfx_mapping(void)
1940 {
1941         return;
1942 }
1943 #endif
1944
1945 #ifdef CONFIG_DMAR_FLOPPY_WA
1946 static inline void iommu_prepare_isa(void)
1947 {
1948         struct pci_dev *pdev;
1949         int ret;
1950
1951         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1952         if (!pdev)
1953                 return;
1954
1955         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1956         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1957
1958         if (ret)
1959                 printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map, "
1960                         "floppy might not work\n");
1961
1962 }
1963 #else
1964 static inline void iommu_prepare_isa(void)
1965 {
1966         return;
1967 }
1968 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1969
1970 static int __init init_dmars(void)
1971 {
1972         struct dmar_drhd_unit *drhd;
1973         struct dmar_rmrr_unit *rmrr;
1974         struct pci_dev *pdev;
1975         struct intel_iommu *iommu;
1976         int i, ret, unit = 0;
1977
1978         /*
1979          * for each drhd
1980          *    allocate root
1981          *    initialize and program root entry to not present
1982          * endfor
1983          */
1984         for_each_drhd_unit(drhd) {
1985                 g_num_of_iommus++;
1986                 /*
1987                  * lock not needed as this is only incremented in the
1988                  * single-threaded kernel __init code path; all other
1989                  * accesses are read-only
1990                  */
1991         }
1992
1993         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1994                         GFP_KERNEL);
1995         if (!g_iommus) {
1996                 printk(KERN_ERR "Allocating global iommu array failed\n");
1997                 ret = -ENOMEM;
1998                 goto error;
1999         }
2000
2001         deferred_flush = kzalloc(g_num_of_iommus *
2002                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2003         if (!deferred_flush) {
2004                 kfree(g_iommus);
2005                 ret = -ENOMEM;
2006                 goto error;
2007         }
2008
2009         for_each_drhd_unit(drhd) {
2010                 if (drhd->ignored)
2011                         continue;
2012
2013                 iommu = drhd->iommu;
2014                 g_iommus[iommu->seq_id] = iommu;
2015
2016                 ret = iommu_init_domains(iommu);
2017                 if (ret)
2018                         goto error;
2019
2020                 /*
2021                  * TBD:
2022                  * we could share the same root & context tables
2023                  * among all IOMMUs. Need to split it later.
2024                  */
2025                 ret = iommu_alloc_root_entry(iommu);
2026                 if (ret) {
2027                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2028                         goto error;
2029                 }
2030         }
2031
2032         for_each_drhd_unit(drhd) {
2033                 if (drhd->ignored)
2034                         continue;
2035
2036                 iommu = drhd->iommu;
2037                 if (dmar_enable_qi(iommu)) {
2038                         /*
2039                          * Queued Invalidate not enabled, use Register Based
2040                          * Invalidate
2041                          */
2042                         iommu->flush.flush_context = __iommu_flush_context;
2043                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2044                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2045                                "invalidation\n",
2046                                (unsigned long long)drhd->reg_base_addr);
2047                 } else {
2048                         iommu->flush.flush_context = qi_flush_context;
2049                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2050                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2051                                "invalidation\n",
2052                                (unsigned long long)drhd->reg_base_addr);
2053                 }
2054         }
2055
2056         /*
2057          * For each rmrr
2058          *   for each dev attached to rmrr
2059          *   do
2060          *     locate drhd for dev, alloc domain for dev
2061          *     allocate free domain
2062          *     allocate page table entries for rmrr
2063          *     if context not allocated for bus
2064          *           allocate and init context
2065          *           set present in root table for this bus
2066          *     init context with domain, translation etc
2067          *    endfor
2068          * endfor
2069          */
2070         for_each_rmrr_units(rmrr) {
2071                 for (i = 0; i < rmrr->devices_cnt; i++) {
2072                         pdev = rmrr->devices[i];
2073                         /* some BIOSes list non-existent devices in the DMAR table */
2074                         if (!pdev)
2075                                 continue;
2076                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2077                         if (ret)
2078                                 printk(KERN_ERR
2079                                  "IOMMU: mapping reserved region failed\n");
2080                 }
2081         }
2082
2083         iommu_prepare_gfx_mapping();
2084
2085         iommu_prepare_isa();
2086
2087         /*
2088          * for each drhd
2089          *   enable fault log
2090          *   global invalidate context cache
2091          *   global invalidate iotlb
2092          *   enable translation
2093          */
2094         for_each_drhd_unit(drhd) {
2095                 if (drhd->ignored)
2096                         continue;
2097                 iommu = drhd->iommu;
2098                 sprintf(iommu->name, "dmar%d", unit++);
2099
2100                 iommu_flush_write_buffer(iommu);
2101
2102                 ret = dmar_set_interrupt(iommu);
2103                 if (ret)
2104                         goto error;
2105
2106                 iommu_set_root_entry(iommu);
2107
2108                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2109                                            0);
2110                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2111                                          0);
2112                 iommu_disable_protect_mem_regions(iommu);
2113
2114                 ret = iommu_enable_translation(iommu);
2115                 if (ret)
2116                         goto error;
2117         }
2118
2119         return 0;
2120 error:
2121         for_each_drhd_unit(drhd) {
2122                 if (drhd->ignored)
2123                         continue;
2124                 iommu = drhd->iommu;
2125                 free_iommu(iommu);
2126         }
2127         kfree(g_iommus);
2128         return ret;
2129 }
2130
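/*
 * Round a host buffer up to whole pages: the page offset of @host_addr is
 * added to @size before aligning, so the result covers every page the
 * buffer touches.  For example, with 4KiB pages, host_addr = 0x1234 and
 * size = 0x2000 gives 0x234 + 0x2000 = 0x2234, which aligns up to 0x3000
 * (three pages).
 */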
2131 static inline u64 aligned_size(u64 host_addr, size_t size)
2132 {
2133         u64 addr;
2134         addr = (host_addr & (~PAGE_MASK)) + size;
2135         return PAGE_ALIGN(addr);
2136 }
2137
2138 struct iova *
2139 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2140 {
2141         struct iova *piova;
2142
2143         /* Make sure it's in range */
2144         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2145         if (!size || (IOVA_START_ADDR + size > end))
2146                 return NULL;
2147
2148         piova = alloc_iova(&domain->iovad,
2149                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2150         return piova;
2151 }
2152
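/*
 * Allocate IOVA space for @size bytes.  Devices limited to 32-bit DMA (or
 * every device, when dmar_forcedac is set) allocate straight from their mask;
 * otherwise we first try to place the mapping below 4GiB and only fall back
 * to the device's full mask if that range is exhausted.
 */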
2153 static struct iova *
2154 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2155                    size_t size, u64 dma_mask)
2156 {
2157         struct pci_dev *pdev = to_pci_dev(dev);
2158         struct iova *iova = NULL;
2159
2160         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2161                 iova = iommu_alloc_iova(domain, size, dma_mask);
2162         else {
2163                 /*
2164                  * First try to allocate an io virtual address in
2165                  * DMA_32BIT_MASK and if that fails then try allocating
2166                  * from higher range
2167                  */
2168                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2169                 if (!iova)
2170                         iova = iommu_alloc_iova(domain, size, dma_mask);
2171         }
2172
2173         if (!iova) {
2174                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2175                 return NULL;
2176         }
2177
2178         return iova;
2179 }
2180
2181 static struct dmar_domain *
2182 get_valid_domain_for_dev(struct pci_dev *pdev)
2183 {
2184         struct dmar_domain *domain;
2185         int ret;
2186
2187         domain = get_domain_for_dev(pdev,
2188                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2189         if (!domain) {
2190                 printk(KERN_ERR
2191                         "Allocating domain for %s failed\n", pci_name(pdev));
2192                 return NULL;
2193         }
2194
2195         /* make sure context mapping is ok */
2196         if (unlikely(!domain_context_mapped(pdev))) {
2197                 ret = domain_context_mapping(domain, pdev);
2198                 if (ret) {
2199                         printk(KERN_ERR
2200                                 "Domain context map for %s failed\n",
2201                                 pci_name(pdev));
2202                         return NULL;
2203                 }
2204         }
2205
2206         return domain;
2207 }
2208
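/*
 * Core single-mapping path: find (or set up) the device's domain, allocate
 * an IOVA window big enough for the buffer, build the page-table entries
 * with read/write permissions derived from the DMA direction, flush the
 * IOTLB (or the write buffer), and return the bus address to hand to the
 * device.  Devices marked DUMMY_DEVICE_DOMAIN_INFO bypass translation and
 * simply get the physical address back.
 */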
2209 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2210                                      size_t size, int dir, u64 dma_mask)
2211 {
2212         struct pci_dev *pdev = to_pci_dev(hwdev);
2213         struct dmar_domain *domain;
2214         phys_addr_t start_paddr;
2215         struct iova *iova;
2216         int prot = 0;
2217         int ret;
2218         struct intel_iommu *iommu;
2219
2220         BUG_ON(dir == DMA_NONE);
2221         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2222                 return paddr;
2223
2224         domain = get_valid_domain_for_dev(pdev);
2225         if (!domain)
2226                 return 0;
2227
2228         iommu = domain_get_iommu(domain);
2229         size = aligned_size((u64)paddr, size);
2230
2231         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2232         if (!iova)
2233                 goto error;
2234
2235         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2236
2237         /*
2238          * Check if DMAR supports zero-length reads on write only
2239          * mappings..
2240          */
2241         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2242                         !cap_zlr(iommu->cap))
2243                 prot |= DMA_PTE_READ;
2244         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2245                 prot |= DMA_PTE_WRITE;
2246         /*
2247          * paddr through paddr + size might cover only part of a page, but we
2248          * must map whole pages.  Note: if two parts of one page are mapped
2249          * separately, we might end up with two guest addresses mapping to the
2250          * same host paddr, but this is not a big problem
2251          */
2252         ret = domain_page_mapping(domain, start_paddr,
2253                 ((u64)paddr) & PAGE_MASK, size, prot);
2254         if (ret)
2255                 goto error;
2256
2257         /* it's a non-present to present mapping */
2258         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2259                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2260         if (ret)
2261                 iommu_flush_write_buffer(iommu);
2262
2263         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2264
2265 error:
2266         if (iova)
2267                 __free_iova(&domain->iovad, iova);
2268         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2269                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2270         return 0;
2271 }
2272
2273 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2274                             size_t size, int dir)
2275 {
2276         return __intel_map_single(hwdev, paddr, size, dir,
2277                                   to_pci_dev(hwdev)->dma_mask);
2278 }
2279
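/*
 * Drain the deferred-unmap queues: one global IOTLB flush per IOMMU that
 * has pending entries, then free the queued IOVAs.  Called with
 * async_umap_flush_lock held, either from the timer or when the queue
 * hits its high-water mark.
 */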
2280 static void flush_unmaps(void)
2281 {
2282         int i, j;
2283
2284         timer_on = 0;
2285
2286         /* just flush them all */
2287         for (i = 0; i < g_num_of_iommus; i++) {
2288                 struct intel_iommu *iommu = g_iommus[i];
2289                 if (!iommu)
2290                         continue;
2291
2292                 if (deferred_flush[i].next) {
2293                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2294                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2295                         for (j = 0; j < deferred_flush[i].next; j++) {
2296                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2297                                                 deferred_flush[i].iova[j]);
2298                         }
2299                         deferred_flush[i].next = 0;
2300                 }
2301         }
2302
2303         list_size = 0;
2304 }
2305
2306 static void flush_unmaps_timeout(unsigned long data)
2307 {
2308         unsigned long flags;
2309
2310         spin_lock_irqsave(&async_umap_flush_lock, flags);
2311         flush_unmaps();
2312         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2313 }
2314
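/*
 * Queue an IOVA for deferred freeing instead of flushing the IOTLB on every
 * unmap.  If the queue is full we flush synchronously; otherwise a 10ms
 * timer picks the batch up later.
 */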
2315 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2316 {
2317         unsigned long flags;
2318         int next, iommu_id;
2319         struct intel_iommu *iommu;
2320
2321         spin_lock_irqsave(&async_umap_flush_lock, flags);
2322         if (list_size == HIGH_WATER_MARK)
2323                 flush_unmaps();
2324
2325         iommu = domain_get_iommu(dom);
2326         iommu_id = iommu->seq_id;
2327
2328         next = deferred_flush[iommu_id].next;
2329         deferred_flush[iommu_id].domain[next] = dom;
2330         deferred_flush[iommu_id].iova[next] = iova;
2331         deferred_flush[iommu_id].next++;
2332
2333         if (!timer_on) {
2334                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2335                 timer_on = 1;
2336         }
2337         list_size++;
2338         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2339 }
2340
2341 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2342                         int dir)
2343 {
2344         struct pci_dev *pdev = to_pci_dev(dev);
2345         struct dmar_domain *domain;
2346         unsigned long start_addr;
2347         struct iova *iova;
2348         struct intel_iommu *iommu;
2349
2350         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2351                 return;
2352         domain = find_domain(pdev);
2353         BUG_ON(!domain);
2354
2355         iommu = domain_get_iommu(domain);
2356
2357         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2358         if (!iova)
2359                 return;
2360
2361         start_addr = iova->pfn_lo << PAGE_SHIFT;
2362         size = aligned_size((u64)dev_addr, size);
2363
2364         pr_debug("Device %s unmapping: %lx@%llx\n",
2365                 pci_name(pdev), size, (unsigned long long)start_addr);
2366
2367         /*  clear the whole page */
2368         dma_pte_clear_range(domain, start_addr, start_addr + size);
2369         /* free page tables */
2370         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2371         if (intel_iommu_strict) {
2372                 if (iommu_flush_iotlb_psi(iommu,
2373                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2374                         iommu_flush_write_buffer(iommu);
2375                 /* free iova */
2376                 __free_iova(&domain->iovad, iova);
2377         } else {
2378                 add_unmap(domain, iova);
2379                 /*
2380                  * queue up the release of the unmap to save the roughly 1/6th
2381                  * of the cpu time otherwise used up by the iotlb flush operation...
2382                  */
2383         }
2384 }
2385
2386 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2387                            dma_addr_t *dma_handle, gfp_t flags)
2388 {
2389         void *vaddr;
2390         int order;
2391
2392         size = PAGE_ALIGN(size);
2393         order = get_order(size);
2394         flags &= ~(GFP_DMA | GFP_DMA32);
2395
2396         vaddr = (void *)__get_free_pages(flags, order);
2397         if (!vaddr)
2398                 return NULL;
2399         memset(vaddr, 0, size);
2400
2401         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2402                                          DMA_BIDIRECTIONAL,
2403                                          hwdev->coherent_dma_mask);
2404         if (*dma_handle)
2405                 return vaddr;
2406         free_pages((unsigned long)vaddr, order);
2407         return NULL;
2408 }
2409
2410 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2411                          dma_addr_t dma_handle)
2412 {
2413         int order;
2414
2415         size = PAGE_ALIGN(size);
2416         order = get_order(size);
2417
2418         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2419         free_pages((unsigned long)vaddr, order);
2420 }
2421
2422 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2423
2424 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2425                     int nelems, int dir)
2426 {
2427         int i;
2428         struct pci_dev *pdev = to_pci_dev(hwdev);
2429         struct dmar_domain *domain;
2430         unsigned long start_addr;
2431         struct iova *iova;
2432         size_t size = 0;
2433         void *addr;
2434         struct scatterlist *sg;
2435         struct intel_iommu *iommu;
2436
2437         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2438                 return;
2439
2440         domain = find_domain(pdev);
2441         BUG_ON(!domain);
2442
2443         iommu = domain_get_iommu(domain);
2444
2445         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2446         if (!iova)
2447                 return;
2448         for_each_sg(sglist, sg, nelems, i) {
2449                 addr = SG_ENT_VIRT_ADDRESS(sg);
2450                 size += aligned_size((u64)addr, sg->length);
2451         }
2452
2453         start_addr = iova->pfn_lo << PAGE_SHIFT;
2454
2455         /*  clear the whole page */
2456         dma_pte_clear_range(domain, start_addr, start_addr + size);
2457         /* free page tables */
2458         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2459
2460         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2461                         size >> VTD_PAGE_SHIFT, 0))
2462                 iommu_flush_write_buffer(iommu);
2463
2464         /* free iova */
2465         __free_iova(&domain->iovad, iova);
2466 }
2467
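/*
 * Scatterlist path for devices that bypass the IOMMU: no translation is set
 * up, each element's dma_address is simply its bus/physical address.
 */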
2468 static int intel_nontranslate_map_sg(struct device *hddev,
2469         struct scatterlist *sglist, int nelems, int dir)
2470 {
2471         int i;
2472         struct scatterlist *sg;
2473
2474         for_each_sg(sglist, sg, nelems, i) {
2475                 BUG_ON(!sg_page(sg));
2476                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2477                 sg->dma_length = sg->length;
2478         }
2479         return nelems;
2480 }
2481
2482 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2483                  int dir)
2484 {
2485         void *addr;
2486         int i;
2487         struct pci_dev *pdev = to_pci_dev(hwdev);
2488         struct dmar_domain *domain;
2489         size_t size = 0;
2490         int prot = 0;
2491         size_t offset = 0;
2492         struct iova *iova = NULL;
2493         int ret;
2494         struct scatterlist *sg;
2495         unsigned long start_addr;
2496         struct intel_iommu *iommu;
2497
2498         BUG_ON(dir == DMA_NONE);
2499         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2500                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2501
2502         domain = get_valid_domain_for_dev(pdev);
2503         if (!domain)
2504                 return 0;
2505
2506         iommu = domain_get_iommu(domain);
2507
2508         for_each_sg(sglist, sg, nelems, i) {
2509                 addr = SG_ENT_VIRT_ADDRESS(sg);
2510                 addr = (void *)virt_to_phys(addr);
2511                 size += aligned_size((u64)addr, sg->length);
2512         }
2513
2514         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2515         if (!iova) {
2516                 sglist->dma_length = 0;
2517                 return 0;
2518         }
2519
2520         /*
2521          * Check if DMAR supports zero-length reads on write only
2522          * mappings..
2523          */
2524         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL ||
2525                         !cap_zlr(iommu->cap))
2526                 prot |= DMA_PTE_READ;
2527         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2528                 prot |= DMA_PTE_WRITE;
2529
2530         start_addr = iova->pfn_lo << PAGE_SHIFT;
2531         offset = 0;
2532         for_each_sg(sglist, sg, nelems, i) {
2533                 addr = SG_ENT_VIRT_ADDRESS(sg);
2534                 addr = (void *)virt_to_phys(addr);
2535                 size = aligned_size((u64)addr, sg->length);
2536                 ret = domain_page_mapping(domain, start_addr + offset,
2537                         ((u64)addr) & PAGE_MASK,
2538                         size, prot);
2539                 if (ret) {
2540                         /*  clear the page */
2541                         dma_pte_clear_range(domain, start_addr,
2542                                   start_addr + offset);
2543                         /* free page tables */
2544                         dma_pte_free_pagetable(domain, start_addr,
2545                                   start_addr + offset);
2546                         /* free iova */
2547                         __free_iova(&domain->iovad, iova);
2548                         return 0;
2549                 }
2550                 sg->dma_address = start_addr + offset +
2551                                 ((u64)addr & (~PAGE_MASK));
2552                 sg->dma_length = sg->length;
2553                 offset += size;
2554         }
2555
2556         /* it's a non-present to present mapping */
2557         if (iommu_flush_iotlb_psi(iommu, domain->id,
2558                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2559                 iommu_flush_write_buffer(iommu);
2560         return nelems;
2561 }
2562
2563 static struct dma_mapping_ops intel_dma_ops = {
2564         .alloc_coherent = intel_alloc_coherent,
2565         .free_coherent = intel_free_coherent,
2566         .map_single = intel_map_single,
2567         .unmap_single = intel_unmap_single,
2568         .map_sg = intel_map_sg,
2569         .unmap_sg = intel_unmap_sg,
2570 };
2571
2572 static inline int iommu_domain_cache_init(void)
2573 {
2574         int ret = 0;
2575
2576         iommu_domain_cache = kmem_cache_create("iommu_domain",
2577                                          sizeof(struct dmar_domain),
2578                                          0,
2579                                          SLAB_HWCACHE_ALIGN,
2580                                          NULL);
2582         if (!iommu_domain_cache) {
2583                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2584                 ret = -ENOMEM;
2585         }
2586
2587         return ret;
2588 }
2589
2590 static inline int iommu_devinfo_cache_init(void)
2591 {
2592         int ret = 0;
2593
2594         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2595                                          sizeof(struct device_domain_info),
2596                                          0,
2597                                          SLAB_HWCACHE_ALIGN,
2598                                          NULL);
2599         if (!iommu_devinfo_cache) {
2600                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2601                 ret = -ENOMEM;
2602         }
2603
2604         return ret;
2605 }
2606
2607 static inline int iommu_iova_cache_init(void)
2608 {
2609         int ret = 0;
2610
2611         iommu_iova_cache = kmem_cache_create("iommu_iova",
2612                                          sizeof(struct iova),
2613                                          0,
2614                                          SLAB_HWCACHE_ALIGN,
2615                                          NULL);
2616         if (!iommu_iova_cache) {
2617                 printk(KERN_ERR "Couldn't create iova cache\n");
2618                 ret = -ENOMEM;
2619         }
2620
2621         return ret;
2622 }
2623
2624 static int __init iommu_init_mempool(void)
2625 {
2626         int ret;
2627         ret = iommu_iova_cache_init();
2628         if (ret)
2629                 return ret;
2630
2631         ret = iommu_domain_cache_init();
2632         if (ret)
2633                 goto domain_error;
2634
2635         ret = iommu_devinfo_cache_init();
2636         if (!ret)
2637                 return ret;
2638
2639         kmem_cache_destroy(iommu_domain_cache);
2640 domain_error:
2641         kmem_cache_destroy(iommu_iova_cache);
2642
2643         return -ENOMEM;
2644 }
2645
2646 static void __init iommu_exit_mempool(void)
2647 {
2648         kmem_cache_destroy(iommu_devinfo_cache);
2649         kmem_cache_destroy(iommu_domain_cache);
2650         kmem_cache_destroy(iommu_iova_cache);
2651
2652 }
2653
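/*
 * Mark DRHD units that cover no PCI devices as ignored.  If gfx mapping is
 * disabled (dmar_map_gfx == 0), also ignore units that serve only graphics
 * devices and flag those devices to bypass translation entirely.
 */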
2654 static void __init init_no_remapping_devices(void)
2655 {
2656         struct dmar_drhd_unit *drhd;
2657
2658         for_each_drhd_unit(drhd) {
2659                 if (!drhd->include_all) {
2660                         int i;
2661                         for (i = 0; i < drhd->devices_cnt; i++)
2662                                 if (drhd->devices[i] != NULL)
2663                                         break;
2664                         /* ignore DMAR unit if no pci devices exist */
2665                         if (i == drhd->devices_cnt)
2666                                 drhd->ignored = 1;
2667                 }
2668         }
2669
2670         if (dmar_map_gfx)
2671                 return;
2672
2673         for_each_drhd_unit(drhd) {
2674                 int i;
2675                 if (drhd->ignored || drhd->include_all)
2676                         continue;
2677
2678                 for (i = 0; i < drhd->devices_cnt; i++)
2679                         if (drhd->devices[i] &&
2680                                 !IS_GFX_DEVICE(drhd->devices[i]))
2681                                 break;
2682
2683                 if (i < drhd->devices_cnt)
2684                         continue;
2685
2686                 /* bypass IOMMU if it is just for gfx devices */
2687                 drhd->ignored = 1;
2688                 for (i = 0; i < drhd->devices_cnt; i++) {
2689                         if (!drhd->devices[i])
2690                                 continue;
2691                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2692                 }
2693         }
2694 }
2695
2696 int __init intel_iommu_init(void)
2697 {
2698         int ret = 0;
2699
2700         if (dmar_table_init())
2701                 return -ENODEV;
2702
2703         if (dmar_dev_scope_init())
2704                 return -ENODEV;
2705
2706         /*
2707          * Check the need for DMA-remapping initialization now.
2708          * Above initialization will also be used by Interrupt-remapping.
2709          */
2710         if (no_iommu || swiotlb || dmar_disabled)
2711                 return -ENODEV;
2712
2713         iommu_init_mempool();
2714         dmar_init_reserved_ranges();
2715
2716         init_no_remapping_devices();
2717
2718         ret = init_dmars();
2719         if (ret) {
2720                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2721                 put_iova_domain(&reserved_iova_list);
2722                 iommu_exit_mempool();
2723                 return ret;
2724         }
2725         printk(KERN_INFO
2726         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2727
2728         init_timer(&unmap_timer);
2729         force_iommu = 1;
2730         dma_ops = &intel_dma_ops;
2731         return 0;
2732 }
2733
2734 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2735                                   struct pci_dev *pdev)
2736 {
2737         struct device_domain_info *info;
2738         unsigned long flags;
2739
2740         info = alloc_devinfo_mem();
2741         if (!info)
2742                 return -ENOMEM;
2743
2744         info->bus = pdev->bus->number;
2745         info->devfn = pdev->devfn;
2746         info->dev = pdev;
2747         info->domain = domain;
2748
2749         spin_lock_irqsave(&device_domain_lock, flags);
2750         list_add(&info->link, &domain->devices);
2751         list_add(&info->global, &device_domain_list);
2752         pdev->dev.archdata.iommu = info;
2753         spin_unlock_irqrestore(&device_domain_lock, flags);
2754
2755         return 0;
2756 }
2757
2758 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2759                                           struct pci_dev *pdev)
2760 {
2761         struct device_domain_info *info;
2762         struct intel_iommu *iommu;
2763         unsigned long flags;
2764         int found = 0;
2765         struct list_head *entry, *tmp;
2766
2767         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2768         if (!iommu)
2769                 return;
2770
2771         spin_lock_irqsave(&device_domain_lock, flags);
2772         list_for_each_safe(entry, tmp, &domain->devices) {
2773                 info = list_entry(entry, struct device_domain_info, link);
2774                 if (info->bus == pdev->bus->number &&
2775                     info->devfn == pdev->devfn) {
2776                         list_del(&info->link);
2777                         list_del(&info->global);
2778                         if (info->dev)
2779                                 info->dev->dev.archdata.iommu = NULL;
2780                         spin_unlock_irqrestore(&device_domain_lock, flags);
2781
2782                         iommu_detach_dev(iommu, info->bus, info->devfn);
2783                         free_devinfo_mem(info);
2784
2785                         spin_lock_irqsave(&device_domain_lock, flags);
2786
2787                         if (found)
2788                                 break;
2789                         else
2790                                 continue;
2791                 }
2792
2793                 /* if there are no other devices under the same iommu
2794                  * owned by this domain, clear this iommu in iommu_bmp,
2795                  * update iommu count and coherency
2796                  */
2797                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2798                         found = 1;
2799         }
2800
2801         if (found == 0) {
2802                 unsigned long tmp_flags;
2803                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2804                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2805                 domain->iommu_count--;
2806                 domain_update_iommu_coherency(domain);
2807                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2808         }
2809
2810         spin_unlock_irqrestore(&device_domain_lock, flags);
2811 }
2812
2813 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2814 {
2815         struct device_domain_info *info;
2816         struct intel_iommu *iommu;
2817         unsigned long flags1, flags2;
2818
2819         spin_lock_irqsave(&device_domain_lock, flags1);
2820         while (!list_empty(&domain->devices)) {
2821                 info = list_entry(domain->devices.next,
2822                         struct device_domain_info, link);
2823                 list_del(&info->link);
2824                 list_del(&info->global);
2825                 if (info->dev)
2826                         info->dev->dev.archdata.iommu = NULL;
2827
2828                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2829
2830                 iommu = device_to_iommu(info->bus, info->devfn);
2831                 iommu_detach_dev(iommu, info->bus, info->devfn);
2832
2833                 /* clear this iommu in iommu_bmp, update iommu count
2834                  * and coherency
2835                  */
2836                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2837                 if (test_and_clear_bit(iommu->seq_id,
2838                                        &domain->iommu_bmp)) {
2839                         domain->iommu_count--;
2840                         domain_update_iommu_coherency(domain);
2841                 }
2842                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2843
2844                 free_devinfo_mem(info);
2845                 spin_lock_irqsave(&device_domain_lock, flags1);
2846         }
2847         spin_unlock_irqrestore(&device_domain_lock, flags1);
2848 }
2849
2850 /* domain id for virtual machine domains; it is never written into a context entry (a per-iommu id is used instead) */
2851 static unsigned long vm_domid;
2852
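/*
 * Return the smallest adjusted guest address width among all IOMMUs this
 * domain is currently attached to (starting from the domain's own agaw).
 * The mappable range of the domain is limited by this minimum.
 */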
2853 static int vm_domain_min_agaw(struct dmar_domain *domain)
2854 {
2855         int i;
2856         int min_agaw = domain->agaw;
2857
2858         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2859         for (; i < g_num_of_iommus; ) {
2860                 if (min_agaw > g_iommus[i]->agaw)
2861                         min_agaw = g_iommus[i]->agaw;
2862
2863                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2864         }
2865
2866         return min_agaw;
2867 }
2868
2869 static struct dmar_domain *iommu_alloc_vm_domain(void)
2870 {
2871         struct dmar_domain *domain;
2872
2873         domain = alloc_domain_mem();
2874         if (!domain)
2875                 return NULL;
2876
2877         domain->id = vm_domid++;
2878         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2879         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2880
2881         return domain;
2882 }
2883
2884 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2885 {
2886         int adjust_width;
2887
2888         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2889         spin_lock_init(&domain->mapping_lock);
2890         spin_lock_init(&domain->iommu_lock);
2891
2892         domain_reserve_special_ranges(domain);
2893
2894         /* calculate AGAW */
2895         domain->gaw = guest_width;
2896         adjust_width = guestwidth_to_adjustwidth(guest_width);
2897         domain->agaw = width_to_agaw(adjust_width);
2898
2899         INIT_LIST_HEAD(&domain->devices);
2900
2901         domain->iommu_count = 0;
2902         domain->iommu_coherency = 0;
2903         domain->max_addr = 0;
2904
2905         /* always allocate the top pgd */
2906         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2907         if (!domain->pgd)
2908                 return -ENOMEM;
2909         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2910         return 0;
2911 }
2912
2913 static void iommu_free_vm_domain(struct dmar_domain *domain)
2914 {
2915         unsigned long flags;
2916         struct dmar_drhd_unit *drhd;
2917         struct intel_iommu *iommu;
2918         unsigned long i;
2919         unsigned long ndomains;
2920
2921         for_each_drhd_unit(drhd) {
2922                 if (drhd->ignored)
2923                         continue;
2924                 iommu = drhd->iommu;
2925
2926                 ndomains = cap_ndoms(iommu->cap);
2927                 i = find_first_bit(iommu->domain_ids, ndomains);
2928                 for (; i < ndomains; ) {
2929                         if (iommu->domains[i] == domain) {
2930                                 spin_lock_irqsave(&iommu->lock, flags);
2931                                 clear_bit(i, iommu->domain_ids);
2932                                 iommu->domains[i] = NULL;
2933                                 spin_unlock_irqrestore(&iommu->lock, flags);
2934                                 break;
2935                         }
2936                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2937                 }
2938         }
2939 }
2940
2941 static void vm_domain_exit(struct dmar_domain *domain)
2942 {
2943         u64 end;
2944
2945         /* Domain 0 is reserved, so don't process it */
2946         if (!domain)
2947                 return;
2948
2949         vm_domain_remove_all_dev_info(domain);
2950         /* destroy iovas */
2951         put_iova_domain(&domain->iovad);
2952         end = DOMAIN_MAX_ADDR(domain->gaw);
2953         end = end & (~VTD_PAGE_MASK);
2954
2955         /* clear ptes */
2956         dma_pte_clear_range(domain, 0, end);
2957
2958         /* free page tables */
2959         dma_pte_free_pagetable(domain, 0, end);
2960
2961         iommu_free_vm_domain(domain);
2962         free_domain_mem(domain);
2963 }
2964
2965 struct dmar_domain *intel_iommu_alloc_domain(void)
2966 {
2967         struct dmar_domain *domain;
2968
2969         domain = iommu_alloc_vm_domain();
2970         if (!domain) {
2971                 printk(KERN_ERR
2972                         "intel_iommu_alloc_domain: domain == NULL\n");
2973                 return NULL;
2974         }
2975         if (vm_domain_init(domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2976                 printk(KERN_ERR
2977                         "intel_iommu_alloc_domain: domain_init() failed\n");
2978                 vm_domain_exit(domain);
2979                 return NULL;
2980         }
2981
2982         return domain;
2983 }
2984 EXPORT_SYMBOL_GPL(intel_iommu_alloc_domain);
2985
2986 void intel_iommu_free_domain(struct dmar_domain *domain)
2987 {
2988         vm_domain_exit(domain);
2989 }
2990 EXPORT_SYMBOL_GPL(intel_iommu_free_domain);
2991
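/*
 * Attach @pdev to a VM domain.  Any previous context mapping for the device
 * is torn down first, and the attach is refused (-EFAULT) if this IOMMU's
 * agaw cannot cover addresses the domain has already mapped (domain->max_addr).
 */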
2992 int intel_iommu_attach_device(struct dmar_domain *domain,
2993                               struct pci_dev *pdev)
2994 {
2995         struct intel_iommu *iommu;
2996         int addr_width;
2997         u64 end;
2998         int ret;
2999
3000         /* normally pdev is not mapped */
3001         if (unlikely(domain_context_mapped(pdev))) {
3002                 struct dmar_domain *old_domain;
3003
3004                 old_domain = find_domain(pdev);
3005                 if (old_domain) {
3006                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3007                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3008                         else
3009                                 domain_remove_dev_info(old_domain);
3010                 }
3011         }
3012
3013         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3014         if (!iommu)
3015                 return -ENODEV;
3016
3017         /* check if this iommu agaw is sufficient for max mapped address */
3018         addr_width = agaw_to_width(iommu->agaw);
3019         end = DOMAIN_MAX_ADDR(addr_width);
3020         end = end & VTD_PAGE_MASK;
3021         if (end < domain->max_addr) {
3022                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3023                        "sufficient for the mapped address (%llx)\n",
3024                        __func__, iommu->agaw, domain->max_addr);
3025                 return -EFAULT;
3026         }
3027
3028         ret = domain_context_mapping(domain, pdev);
3029         if (ret)
3030                 return ret;
3031
3032         ret = vm_domain_add_dev_info(domain, pdev);
3033         return ret;
3034 }
3035 EXPORT_SYMBOL_GPL(intel_iommu_attach_device);
3036
3037 void intel_iommu_detach_device(struct dmar_domain *domain,
3038                                struct pci_dev *pdev)
3039 {
3040         vm_domain_remove_one_dev_info(domain, pdev);
3041 }
3042 EXPORT_SYMBOL_GPL(intel_iommu_detach_device);
3043
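/*
 * Map [iova, iova + size) to @hpa in a VM domain.  If the mapping would grow
 * the domain's highest mapped address, first verify that the minimum agaw of
 * all attached IOMMUs can still address the new top; otherwise fail with
 * -EFAULT rather than create a mapping some IOMMU cannot translate.
 */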
3044 int intel_iommu_map_address(struct dmar_domain *domain, dma_addr_t iova,
3045                             u64 hpa, size_t size, int prot)
3046 {
3047         u64 max_addr;
3048         int addr_width;
3049         int ret;
3050
3051         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3052         if (domain->max_addr < max_addr) {
3053                 int min_agaw;
3054                 u64 end;
3055
3056                 /* check if minimum agaw is sufficient for mapped address */
3057                 min_agaw = vm_domain_min_agaw(domain);
3058                 addr_width = agaw_to_width(min_agaw);
3059                 end = DOMAIN_MAX_ADDR(addr_width);
3060                 end = end & VTD_PAGE_MASK;
3061                 if (end < max_addr) {
3062                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3063                                "sufficient for the mapped address (%llx)\n",
3064                                __func__, min_agaw, max_addr);
3065                         return -EFAULT;
3066                 }
3067                 domain->max_addr = max_addr;
3068         }
3069
3070         ret = domain_page_mapping(domain, iova, hpa, size, prot);
3071         return ret;
3072 }
3073 EXPORT_SYMBOL_GPL(intel_iommu_map_address);
3074
3075 void intel_iommu_unmap_address(struct dmar_domain *domain,
3076                                dma_addr_t iova, size_t size)
3077 {
3078         dma_addr_t base;
3079
3080         /* The address might not be aligned */
3081         base = iova & VTD_PAGE_MASK;
3082         size = VTD_PAGE_ALIGN(size);
3083         dma_pte_clear_range(domain, base, base + size);
3084
3085         if (domain->max_addr == base + size)
3086                 domain->max_addr = base;
3087 }
3088 EXPORT_SYMBOL_GPL(intel_iommu_unmap_address);
3089
3090 int intel_iommu_found(void)
3091 {
3092         return g_num_of_iommus;
3093 }
3094 EXPORT_SYMBOL_GPL(intel_iommu_found);
3095
3096 u64 intel_iommu_iova_to_phys(struct dmar_domain *domain, u64 iova)
3097 {
3098         struct dma_pte *pte;
3099         u64 phys = 0;
3100
3101         pte = addr_to_dma_pte(domain, iova);
3102         if (pte)
3103                 phys = dma_pte_addr(pte);
3104
3105         return phys;
3106 }
3107 EXPORT_SYMBOL_GPL(intel_iommu_iova_to_phys);
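/*
 * Rough usage sketch for the exported VM-domain API above (illustrative
 * only; a hypothetical caller such as a device-assignment module would add
 * its own error handling and locking):
 *
 *	struct dmar_domain *dom;
 *
 *	if (!intel_iommu_found())
 *		return -ENODEV;
 *	dom = intel_iommu_alloc_domain();
 *	if (!dom)
 *		return -ENOMEM;
 *	if (intel_iommu_attach_device(dom, pdev))
 *		goto free;
 *	if (intel_iommu_map_address(dom, iova, hpa, size,
 *				    DMA_PTE_READ | DMA_PTE_WRITE))
 *		goto detach;
 *	... device performs DMA through [iova, iova + size) ...
 *	intel_iommu_unmap_address(dom, iova, size);
 * detach:
 *	intel_iommu_detach_device(dom, pdev);
 * free:
 *	intel_iommu_free_domain(dom);
 */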