[linux-2.6] drivers/pci/intel-iommu.c
1 /*
2  * Copyright (c) 2006, Intel Corporation.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms and conditions of the GNU General Public License,
6  * version 2, as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope it will be useful, but WITHOUT
9  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
11  * more details.
12  *
13  * You should have received a copy of the GNU General Public License along with
14  * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
15  * Place - Suite 330, Boston, MA 02111-1307 USA.
16  *
17  * Copyright (C) 2006-2008 Intel Corporation
18  * Author: Ashok Raj <ashok.raj@intel.com>
19  * Author: Shaohua Li <shaohua.li@intel.com>
20  * Author: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
21  * Author: Fenghua Yu <fenghua.yu@intel.com>
22  */
23
24 #include <linux/init.h>
25 #include <linux/bitmap.h>
26 #include <linux/debugfs.h>
27 #include <linux/slab.h>
28 #include <linux/irq.h>
29 #include <linux/interrupt.h>
30 #include <linux/spinlock.h>
31 #include <linux/pci.h>
32 #include <linux/dmar.h>
33 #include <linux/dma-mapping.h>
34 #include <linux/mempool.h>
35 #include <linux/timer.h>
36 #include <linux/iova.h>
37 #include <linux/iommu.h>
38 #include <linux/intel-iommu.h>
39 #include <asm/cacheflush.h>
40 #include <asm/iommu.h>
41 #include "pci.h"
42
43 #define ROOT_SIZE               VTD_PAGE_SIZE
44 #define CONTEXT_SIZE            VTD_PAGE_SIZE
45
46 #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY)
47 #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA)
48
49 #define IOAPIC_RANGE_START      (0xfee00000)
50 #define IOAPIC_RANGE_END        (0xfeefffff)
51 #define IOVA_START_ADDR         (0x1000)
52
53 #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48
54
55 #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1)
56
57 #define IOVA_PFN(addr)          ((addr) >> PAGE_SHIFT)
58 #define DMA_32BIT_PFN           IOVA_PFN(DMA_32BIT_MASK)
59 #define DMA_64BIT_PFN           IOVA_PFN(DMA_64BIT_MASK)
60
61 /* global iommu list, set NULL for ignored DMAR units */
62 static struct intel_iommu **g_iommus;
63
64 /*
65  * 0: Present
66  * 1-11: Reserved
67  * 12-63: Context Ptr (12 - (haw-1))
68  * 64-127: Reserved
69  */
70 struct root_entry {
71         u64     val;
72         u64     rsvd1;
73 };
74 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
75 static inline bool root_present(struct root_entry *root)
76 {
77         return (root->val & 1);
78 }
79 static inline void set_root_present(struct root_entry *root)
80 {
81         root->val |= 1;
82 }
83 static inline void set_root_value(struct root_entry *root, unsigned long value)
84 {
85         root->val |= value & VTD_PAGE_MASK;
86 }
87
88 static inline struct context_entry *
89 get_context_addr_from_root(struct root_entry *root)
90 {
91         return (struct context_entry *)
92                 (root_present(root)?phys_to_virt(
93                 root->val & VTD_PAGE_MASK) :
94                 NULL);
95 }
96
97 /*
98  * low 64 bits:
99  * 0: present
100  * 1: fault processing disable
101  * 2-3: translation type
102  * 12-63: address space root
103  * high 64 bits:
104  * 0-2: address width
105  * 3-6: avail
106  * 8-23: domain id
107  */
108 struct context_entry {
109         u64 lo;
110         u64 hi;
111 };
112
113 static inline bool context_present(struct context_entry *context)
114 {
115         return (context->lo & 1);
116 }
117 static inline void context_set_present(struct context_entry *context)
118 {
119         context->lo |= 1;
120 }
121
122 static inline void context_set_fault_enable(struct context_entry *context)
123 {
124         context->lo &= (((u64)-1) << 2) | 1;
125 }
126
127 #define CONTEXT_TT_MULTI_LEVEL 0
128
129 static inline void context_set_translation_type(struct context_entry *context,
130                                                 unsigned long value)
131 {
132         context->lo &= (((u64)-1) << 4) | 3;
133         context->lo |= (value & 3) << 2;
134 }
135
136 static inline void context_set_address_root(struct context_entry *context,
137                                             unsigned long value)
138 {
139         context->lo |= value & VTD_PAGE_MASK;
140 }
141
142 static inline void context_set_address_width(struct context_entry *context,
143                                              unsigned long value)
144 {
145         context->hi |= value & 7;
146 }
147
148 static inline void context_set_domain_id(struct context_entry *context,
149                                          unsigned long value)
150 {
151         context->hi |= (value & ((1 << 16) - 1)) << 8;
152 }
153
154 static inline void context_clear_entry(struct context_entry *context)
155 {
156         context->lo = 0;
157         context->hi = 0;
158 }
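/*
 * Example: pointing a context entry at a domain's page table combines the
 * helpers above, e.g.
 *
 *	context_set_domain_id(ce, did);
 *	context_set_address_width(ce, agaw);
 *	context_set_address_root(ce, virt_to_phys(pgd));
 *	context_set_translation_type(ce, CONTEXT_TT_MULTI_LEVEL);
 *	context_set_fault_enable(ce);
 *	context_set_present(ce);
 *
 * which is the sequence used by domain_context_mapping_one() below.
 */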
159
160 /*
161  * 0: readable
162  * 1: writable
163  * 2-6: reserved
164  * 7: super page
165  * 8-11: available
166  * 12-63: Host physical address
167  */
168 struct dma_pte {
169         u64 val;
170 };
171
172 static inline void dma_clear_pte(struct dma_pte *pte)
173 {
174         pte->val = 0;
175 }
176
177 static inline void dma_set_pte_readable(struct dma_pte *pte)
178 {
179         pte->val |= DMA_PTE_READ;
180 }
181
182 static inline void dma_set_pte_writable(struct dma_pte *pte)
183 {
184         pte->val |= DMA_PTE_WRITE;
185 }
186
187 static inline void dma_set_pte_prot(struct dma_pte *pte, unsigned long prot)
188 {
189         pte->val = (pte->val & ~3) | (prot & 3);
190 }
191
192 static inline u64 dma_pte_addr(struct dma_pte *pte)
193 {
194         return (pte->val & VTD_PAGE_MASK);
195 }
196
197 static inline void dma_set_pte_addr(struct dma_pte *pte, u64 addr)
198 {
199         pte->val |= (addr & VTD_PAGE_MASK);
200 }
201
202 static inline bool dma_pte_present(struct dma_pte *pte)
203 {
204         return (pte->val & 3) != 0;
205 }
206
207 /* devices under the same p2p bridge are owned by one domain */
208 #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0)
209
210 /* domain represents a virtual machine; more than one device
211  * across iommus may be owned by one domain, e.g. a kvm guest.
212  */
213 #define DOMAIN_FLAG_VIRTUAL_MACHINE     (1 << 1)
214
215 struct dmar_domain {
216         int     id;                     /* domain id */
217         unsigned long iommu_bmp;        /* bitmap of iommus this domain uses*/
218
219         struct list_head devices;       /* all devices' list */
220         struct iova_domain iovad;       /* iova's that belong to this domain */
221
222         struct dma_pte  *pgd;           /* virtual address */
223         spinlock_t      mapping_lock;   /* page table lock */
224         int             gaw;            /* max guest address width */
225
226         /* adjusted guest address width; 0 means a 2-level, 30-bit table */
227         int             agaw;
228
229         int             flags;          /* flags to find out type of domain */
230
231         int             iommu_coherency;/* indicate coherency of iommu access */
232         int             iommu_count;    /* reference count of iommu */
233         spinlock_t      iommu_lock;     /* protect iommu set in domain */
234         u64             max_addr;       /* maximum mapped address */
235 };
236
237 /* PCI domain-device relationship */
238 struct device_domain_info {
239         struct list_head link;  /* link to domain siblings */
240         struct list_head global; /* link to global list */
241         u8 bus;                 /* PCI bus number */
242         u8 devfn;               /* PCI devfn number */
243         struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */
244         struct dmar_domain *domain; /* pointer to domain */
245 };
246
247 static void flush_unmaps_timeout(unsigned long data);
248
249 DEFINE_TIMER(unmap_timer,  flush_unmaps_timeout, 0, 0);
250
251 #define HIGH_WATER_MARK 250
252 struct deferred_flush_tables {
253         int next;
254         struct iova *iova[HIGH_WATER_MARK];
255         struct dmar_domain *domain[HIGH_WATER_MARK];
256 };
257
258 static struct deferred_flush_tables *deferred_flush;
259
260 /* number of iommus; sizes g_iommus and the per-domain iommu bitmaps */
261 static int g_num_of_iommus;
262
263 static DEFINE_SPINLOCK(async_umap_flush_lock);
264 static LIST_HEAD(unmaps_to_do);
265
266 static int timer_on;
267 static long list_size;
268
269 static void domain_remove_dev_info(struct dmar_domain *domain);
270
271 int dmar_disabled;
272 static int __initdata dmar_map_gfx = 1;
273 static int dmar_forcedac;
274 static int intel_iommu_strict;
275
276 #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1))
277 static DEFINE_SPINLOCK(device_domain_lock);
278 static LIST_HEAD(device_domain_list);
279
280 static struct iommu_ops intel_iommu_ops;
281
282 static int __init intel_iommu_setup(char *str)
283 {
284         if (!str)
285                 return -EINVAL;
286         while (*str) {
287                 if (!strncmp(str, "off", 3)) {
288                         dmar_disabled = 1;
289                         printk(KERN_INFO"Intel-IOMMU: disabled\n");
290                 } else if (!strncmp(str, "igfx_off", 8)) {
291                         dmar_map_gfx = 0;
292                         printk(KERN_INFO
293                                 "Intel-IOMMU: disable GFX device mapping\n");
294                 } else if (!strncmp(str, "forcedac", 8)) {
295                         printk(KERN_INFO
296                                 "Intel-IOMMU: Forcing DAC for PCI devices\n");
297                         dmar_forcedac = 1;
298                 } else if (!strncmp(str, "strict", 6)) {
299                         printk(KERN_INFO
300                                 "Intel-IOMMU: disable batched IOTLB flush\n");
301                         intel_iommu_strict = 1;
302                 }
303
304                 str += strcspn(str, ",");
305                 while (*str == ',')
306                         str++;
307         }
308         return 0;
309 }
310 __setup("intel_iommu=", intel_iommu_setup);
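/*
 * Example: booting with "intel_iommu=igfx_off,strict" keeps DMA remapping
 * enabled, but disables IOMMU mapping for graphics devices and disables
 * the batched (deferred) IOTLB flush.
 */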
311
312 static struct kmem_cache *iommu_domain_cache;
313 static struct kmem_cache *iommu_devinfo_cache;
314 static struct kmem_cache *iommu_iova_cache;
315
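/*
 * Allocation helpers used from atomic context (e.g. the DMA mapping
 * paths). Setting PF_MEMALLOC around the GFP_ATOMIC allocation lets it
 * dip into the emergency reserves, so a transient low-memory condition
 * is less likely to fail a DMA map request.
 */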
316 static inline void *iommu_kmem_cache_alloc(struct kmem_cache *cachep)
317 {
318         unsigned int flags;
319         void *vaddr;
320
321         /* trying to avoid low memory issues */
322         flags = current->flags & PF_MEMALLOC;
323         current->flags |= PF_MEMALLOC;
324         vaddr = kmem_cache_alloc(cachep, GFP_ATOMIC);
325         current->flags &= (~PF_MEMALLOC | flags);
326         return vaddr;
327 }
328
329
330 static inline void *alloc_pgtable_page(void)
331 {
332         unsigned int flags;
333         void *vaddr;
334
335         /* trying to avoid low memory issues */
336         flags = current->flags & PF_MEMALLOC;
337         current->flags |= PF_MEMALLOC;
338         vaddr = (void *)get_zeroed_page(GFP_ATOMIC);
339         current->flags &= (~PF_MEMALLOC | flags);
340         return vaddr;
341 }
342
343 static inline void free_pgtable_page(void *vaddr)
344 {
345         free_page((unsigned long)vaddr);
346 }
347
348 static inline void *alloc_domain_mem(void)
349 {
350         return iommu_kmem_cache_alloc(iommu_domain_cache);
351 }
352
353 static void free_domain_mem(void *vaddr)
354 {
355         kmem_cache_free(iommu_domain_cache, vaddr);
356 }
357
358 static inline void * alloc_devinfo_mem(void)
359 {
360         return iommu_kmem_cache_alloc(iommu_devinfo_cache);
361 }
362
363 static inline void free_devinfo_mem(void *vaddr)
364 {
365         kmem_cache_free(iommu_devinfo_cache, vaddr);
366 }
367
368 struct iova *alloc_iova_mem(void)
369 {
370         return iommu_kmem_cache_alloc(iommu_iova_cache);
371 }
372
373 void free_iova_mem(struct iova *iova)
374 {
375         kmem_cache_free(iommu_iova_cache, iova);
376 }
377
378
379 static inline int width_to_agaw(int width);
380
381 /* calculate agaw for each iommu.
382  * "SAGAW" may be different across iommus; use a default agaw, and fall
383  * back to a smaller supported agaw for iommus that lack the default agaw.
384  */
385 int iommu_calculate_agaw(struct intel_iommu *iommu)
386 {
387         unsigned long sagaw;
388         int agaw = -1;
389
390         sagaw = cap_sagaw(iommu->cap);
391         for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH);
392              agaw >= 0; agaw--) {
393                 if (test_bit(agaw, &sagaw))
394                         break;
395         }
396
397         return agaw;
398 }
399
400 /* in the native case, each domain is associated with only one iommu */
401 static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain)
402 {
403         int iommu_id;
404
405         BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE);
406
407         iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
408         if (iommu_id < 0 || iommu_id >= g_num_of_iommus)
409                 return NULL;
410
411         return g_iommus[iommu_id];
412 }
413
414 /* "Coherency" capability may be different across iommus */
415 static void domain_update_iommu_coherency(struct dmar_domain *domain)
416 {
417         int i;
418
419         domain->iommu_coherency = 1;
420
421         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
422         for (; i < g_num_of_iommus; ) {
423                 if (!ecap_coherent(g_iommus[i]->ecap)) {
424                         domain->iommu_coherency = 0;
425                         break;
426                 }
427                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
428         }
429 }
430
431 static struct intel_iommu *device_to_iommu(u8 bus, u8 devfn)
432 {
433         struct dmar_drhd_unit *drhd = NULL;
434         int i;
435
436         for_each_drhd_unit(drhd) {
437                 if (drhd->ignored)
438                         continue;
439
440                 for (i = 0; i < drhd->devices_cnt; i++)
441                         if (drhd->devices[i]->bus->number == bus &&
442                             drhd->devices[i]->devfn == devfn)
443                                 return drhd->iommu;
444
445                 if (drhd->include_all)
446                         return drhd->iommu;
447         }
448
449         return NULL;
450 }
451
452 static void domain_flush_cache(struct dmar_domain *domain,
453                                void *addr, int size)
454 {
455         if (!domain->iommu_coherency)
456                 clflush_cache_range(addr, size);
457 }
458
459 /* Get the context entry for a given bus/devfn, allocating the context table on demand */
460 static struct context_entry * device_to_context_entry(struct intel_iommu *iommu,
461                 u8 bus, u8 devfn)
462 {
463         struct root_entry *root;
464         struct context_entry *context;
465         unsigned long phy_addr;
466         unsigned long flags;
467
468         spin_lock_irqsave(&iommu->lock, flags);
469         root = &iommu->root_entry[bus];
470         context = get_context_addr_from_root(root);
471         if (!context) {
472                 context = (struct context_entry *)alloc_pgtable_page();
473                 if (!context) {
474                         spin_unlock_irqrestore(&iommu->lock, flags);
475                         return NULL;
476                 }
477                 __iommu_flush_cache(iommu, (void *)context, CONTEXT_SIZE);
478                 phy_addr = virt_to_phys((void *)context);
479                 set_root_value(root, phy_addr);
480                 set_root_present(root);
481                 __iommu_flush_cache(iommu, root, sizeof(*root));
482         }
483         spin_unlock_irqrestore(&iommu->lock, flags);
484         return &context[devfn];
485 }
486
487 static int device_context_mapped(struct intel_iommu *iommu, u8 bus, u8 devfn)
488 {
489         struct root_entry *root;
490         struct context_entry *context;
491         int ret;
492         unsigned long flags;
493
494         spin_lock_irqsave(&iommu->lock, flags);
495         root = &iommu->root_entry[bus];
496         context = get_context_addr_from_root(root);
497         if (!context) {
498                 ret = 0;
499                 goto out;
500         }
501         ret = context_present(&context[devfn]);
502 out:
503         spin_unlock_irqrestore(&iommu->lock, flags);
504         return ret;
505 }
506
507 static void clear_context_table(struct intel_iommu *iommu, u8 bus, u8 devfn)
508 {
509         struct root_entry *root;
510         struct context_entry *context;
511         unsigned long flags;
512
513         spin_lock_irqsave(&iommu->lock, flags);
514         root = &iommu->root_entry[bus];
515         context = get_context_addr_from_root(root);
516         if (context) {
517                 context_clear_entry(&context[devfn]);
518                 __iommu_flush_cache(iommu, &context[devfn], \
519                         sizeof(*context));
520         }
521         spin_unlock_irqrestore(&iommu->lock, flags);
522 }
523
524 static void free_context_table(struct intel_iommu *iommu)
525 {
526         struct root_entry *root;
527         int i;
528         unsigned long flags;
529         struct context_entry *context;
530
531         spin_lock_irqsave(&iommu->lock, flags);
532         if (!iommu->root_entry) {
533                 goto out;
534         }
535         for (i = 0; i < ROOT_ENTRY_NR; i++) {
536                 root = &iommu->root_entry[i];
537                 context = get_context_addr_from_root(root);
538                 if (context)
539                         free_pgtable_page(context);
540         }
541         free_pgtable_page(iommu->root_entry);
542         iommu->root_entry = NULL;
543 out:
544         spin_unlock_irqrestore(&iommu->lock, flags);
545 }
546
547 /* page table handling */
548 #define LEVEL_STRIDE            (9)
549 #define LEVEL_MASK              (((u64)1 << LEVEL_STRIDE) - 1)
550
551 static inline int agaw_to_level(int agaw)
552 {
553         return agaw + 2;
554 }
555
556 static inline int agaw_to_width(int agaw)
557 {
558         return 30 + agaw * LEVEL_STRIDE;
559
560 }
561
562 static inline int width_to_agaw(int width)
563 {
564         return (width - 30) / LEVEL_STRIDE;
565 }
566
567 static inline unsigned int level_to_offset_bits(int level)
568 {
569         return (12 + (level - 1) * LEVEL_STRIDE);
570 }
571
572 static inline int address_level_offset(u64 addr, int level)
573 {
574         return ((addr >> level_to_offset_bits(level)) & LEVEL_MASK);
575 }
576
577 static inline u64 level_mask(int level)
578 {
579         return ((u64)-1 << level_to_offset_bits(level));
580 }
581
582 static inline u64 level_size(int level)
583 {
584         return ((u64)1 << level_to_offset_bits(level));
585 }
586
587 static inline u64 align_to_level(u64 addr, int level)
588 {
589         return ((addr + level_size(level) - 1) & level_mask(level));
590 }
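/*
 * Example: with DEFAULT_DOMAIN_ADDRESS_WIDTH (48 bits), width_to_agaw()
 * gives (48 - 30) / 9 = 2 and agaw_to_level() gives 4, i.e. a four-level
 * table where level 1 indexes address bits 12-20 and level 4 indexes
 * bits 39-47.
 */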
591
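/*
 * Return the last-level (4KiB) pte for @addr, walking the page-table
 * hierarchy and allocating any missing intermediate tables on the way
 * down. Newly written entries are flushed when the iommu is not
 * cache coherent.
 */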
592 static struct dma_pte * addr_to_dma_pte(struct dmar_domain *domain, u64 addr)
593 {
594         int addr_width = agaw_to_width(domain->agaw);
595         struct dma_pte *parent, *pte = NULL;
596         int level = agaw_to_level(domain->agaw);
597         int offset;
598         unsigned long flags;
599
600         BUG_ON(!domain->pgd);
601
602         addr &= (((u64)1) << addr_width) - 1;
603         parent = domain->pgd;
604
605         spin_lock_irqsave(&domain->mapping_lock, flags);
606         while (level > 0) {
607                 void *tmp_page;
608
609                 offset = address_level_offset(addr, level);
610                 pte = &parent[offset];
611                 if (level == 1)
612                         break;
613
614                 if (!dma_pte_present(pte)) {
615                         tmp_page = alloc_pgtable_page();
616
617                         if (!tmp_page) {
618                                 spin_unlock_irqrestore(&domain->mapping_lock,
619                                         flags);
620                                 return NULL;
621                         }
622                         domain_flush_cache(domain, tmp_page, PAGE_SIZE);
623                         dma_set_pte_addr(pte, virt_to_phys(tmp_page));
624                         /*
625                          * non-leaf levels always set r/w; the last level
626                          * page table controls read/write access
627                          */
628                         dma_set_pte_readable(pte);
629                         dma_set_pte_writable(pte);
630                         domain_flush_cache(domain, pte, sizeof(*pte));
631                 }
632                 parent = phys_to_virt(dma_pte_addr(pte));
633                 level--;
634         }
635
636         spin_unlock_irqrestore(&domain->mapping_lock, flags);
637         return pte;
638 }
639
640 /* return address's pte at specific level */
641 static struct dma_pte *dma_addr_level_pte(struct dmar_domain *domain, u64 addr,
642                 int level)
643 {
644         struct dma_pte *parent, *pte = NULL;
645         int total = agaw_to_level(domain->agaw);
646         int offset;
647
648         parent = domain->pgd;
649         while (level <= total) {
650                 offset = address_level_offset(addr, total);
651                 pte = &parent[offset];
652                 if (level == total)
653                         return pte;
654
655                 if (!dma_pte_present(pte))
656                         break;
657                 parent = phys_to_virt(dma_pte_addr(pte));
658                 total--;
659         }
660         return NULL;
661 }
662
663 /* clear one page's last-level pte */
664 static void dma_pte_clear_one(struct dmar_domain *domain, u64 addr)
665 {
666         struct dma_pte *pte = NULL;
667
668         /* get last level pte */
669         pte = dma_addr_level_pte(domain, addr, 1);
670
671         if (pte) {
672                 dma_clear_pte(pte);
673                 domain_flush_cache(domain, pte, sizeof(*pte));
674         }
675 }
676
677 /* clear the last level ptes; a tlb flush should follow */
678 static void dma_pte_clear_range(struct dmar_domain *domain, u64 start, u64 end)
679 {
680         int addr_width = agaw_to_width(domain->agaw);
681
682         start &= (((u64)1) << addr_width) - 1;
683         end &= (((u64)1) << addr_width) - 1;
684         /* in case it's a partial page */
685         start = PAGE_ALIGN(start);
686         end &= PAGE_MASK;
687
688         /* we don't need a lock here; nobody else touches this iova range */
689         while (start < end) {
690                 dma_pte_clear_one(domain, start);
691                 start += VTD_PAGE_SIZE;
692         }
693 }
694
695 /* free page table pages; the last level ptes should already be cleared */
696 static void dma_pte_free_pagetable(struct dmar_domain *domain,
697         u64 start, u64 end)
698 {
699         int addr_width = agaw_to_width(domain->agaw);
700         struct dma_pte *pte;
701         int total = agaw_to_level(domain->agaw);
702         int level;
703         u64 tmp;
704
705         start &= (((u64)1) << addr_width) - 1;
706         end &= (((u64)1) << addr_width) - 1;
707
708         /* we don't need a lock here; nobody else touches this iova range */
709         level = 2;
710         while (level <= total) {
711                 tmp = align_to_level(start, level);
712                 if (tmp >= end || (tmp + level_size(level) > end))
713                         return;
714
715                 while (tmp < end) {
716                         pte = dma_addr_level_pte(domain, tmp, level);
717                         if (pte) {
718                                 free_pgtable_page(
719                                         phys_to_virt(dma_pte_addr(pte)));
720                                 dma_clear_pte(pte);
721                                 domain_flush_cache(domain, pte, sizeof(*pte));
722                         }
723                         tmp += level_size(level);
724                 }
725                 level++;
726         }
727         /* free pgd */
728         if (start == 0 && end >= ((((u64)1) << addr_width) - 1)) {
729                 free_pgtable_page(domain->pgd);
730                 domain->pgd = NULL;
731         }
732 }
733
734 /* iommu handling */
735 static int iommu_alloc_root_entry(struct intel_iommu *iommu)
736 {
737         struct root_entry *root;
738         unsigned long flags;
739
740         root = (struct root_entry *)alloc_pgtable_page();
741         if (!root)
742                 return -ENOMEM;
743
744         __iommu_flush_cache(iommu, root, ROOT_SIZE);
745
746         spin_lock_irqsave(&iommu->lock, flags);
747         iommu->root_entry = root;
748         spin_unlock_irqrestore(&iommu->lock, flags);
749
750         return 0;
751 }
752
753 static void iommu_set_root_entry(struct intel_iommu *iommu)
754 {
755         void *addr;
756         u32 cmd, sts;
757         unsigned long flag;
758
759         addr = iommu->root_entry;
760
761         spin_lock_irqsave(&iommu->register_lock, flag);
762         dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr));
763
764         cmd = iommu->gcmd | DMA_GCMD_SRTP;
765         writel(cmd, iommu->reg + DMAR_GCMD_REG);
766
767         /* Make sure hardware completes it */
768         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
769                 readl, (sts & DMA_GSTS_RTPS), sts);
770
771         spin_unlock_irqrestore(&iommu->register_lock, flag);
772 }
773
774 static void iommu_flush_write_buffer(struct intel_iommu *iommu)
775 {
776         u32 val;
777         unsigned long flag;
778
779         if (!cap_rwbf(iommu->cap))
780                 return;
781         val = iommu->gcmd | DMA_GCMD_WBF;
782
783         spin_lock_irqsave(&iommu->register_lock, flag);
784         writel(val, iommu->reg + DMAR_GCMD_REG);
785
786         /* Make sure hardware completes it */
787         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
788                         readl, (!(val & DMA_GSTS_WBFS)), val);
789
790         spin_unlock_irqrestore(&iommu->register_lock, flag);
791 }
792
793 /* the return value determines whether we need a write buffer flush */
794 static int __iommu_flush_context(struct intel_iommu *iommu,
795         u16 did, u16 source_id, u8 function_mask, u64 type,
796         int non_present_entry_flush)
797 {
798         u64 val = 0;
799         unsigned long flag;
800
801         /*
802          * In the non-present entry flush case: if the hardware doesn't
803          * cache non-present entries we do nothing; if it does, we flush
804          * the entries of domain 0 (domain id 0 is used to cache any
805          * non-present entries)
806          */
807         if (non_present_entry_flush) {
808                 if (!cap_caching_mode(iommu->cap))
809                         return 1;
810                 else
811                         did = 0;
812         }
813
814         switch (type) {
815         case DMA_CCMD_GLOBAL_INVL:
816                 val = DMA_CCMD_GLOBAL_INVL;
817                 break;
818         case DMA_CCMD_DOMAIN_INVL:
819                 val = DMA_CCMD_DOMAIN_INVL|DMA_CCMD_DID(did);
820                 break;
821         case DMA_CCMD_DEVICE_INVL:
822                 val = DMA_CCMD_DEVICE_INVL|DMA_CCMD_DID(did)
823                         | DMA_CCMD_SID(source_id) | DMA_CCMD_FM(function_mask);
824                 break;
825         default:
826                 BUG();
827         }
828         val |= DMA_CCMD_ICC;
829
830         spin_lock_irqsave(&iommu->register_lock, flag);
831         dmar_writeq(iommu->reg + DMAR_CCMD_REG, val);
832
833         /* Make sure hardware completes it */
834         IOMMU_WAIT_OP(iommu, DMAR_CCMD_REG,
835                 dmar_readq, (!(val & DMA_CCMD_ICC)), val);
836
837         spin_unlock_irqrestore(&iommu->register_lock, flag);
838
839         /* flushing context entries implicitly flushes the write buffer */
840         return 0;
841 }
842
843 /* the return value determines whether we need a write buffer flush */
844 static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did,
845         u64 addr, unsigned int size_order, u64 type,
846         int non_present_entry_flush)
847 {
848         int tlb_offset = ecap_iotlb_offset(iommu->ecap);
849         u64 val = 0, val_iva = 0;
850         unsigned long flag;
851
852         /*
853          * In the non-present entry flush case: if the hardware doesn't
854          * cache non-present entries we do nothing; if it does, we flush
855          * the entries of domain 0 (domain id 0 is used to cache any
856          * non-present entries)
857          */
858         if (non_present_entry_flush) {
859                 if (!cap_caching_mode(iommu->cap))
860                         return 1;
861                 else
862                         did = 0;
863         }
864
865         switch (type) {
866         case DMA_TLB_GLOBAL_FLUSH:
867                 /* a global flush doesn't need to set IVA_REG */
868                 val = DMA_TLB_GLOBAL_FLUSH|DMA_TLB_IVT;
869                 break;
870         case DMA_TLB_DSI_FLUSH:
871                 val = DMA_TLB_DSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
872                 break;
873         case DMA_TLB_PSI_FLUSH:
874                 val = DMA_TLB_PSI_FLUSH|DMA_TLB_IVT|DMA_TLB_DID(did);
875                 /* Note: always flush non-leaf currently */
876                 val_iva = size_order | addr;
877                 break;
878         default:
879                 BUG();
880         }
881         /* Note: set drain read/write */
882 #if 0
883         /*
884          * Read drain is probably only needed to be extra safe; it looks
885          * like we can skip it without any impact.
886          */
887         if (cap_read_drain(iommu->cap))
888                 val |= DMA_TLB_READ_DRAIN;
889 #endif
890         if (cap_write_drain(iommu->cap))
891                 val |= DMA_TLB_WRITE_DRAIN;
892
893         spin_lock_irqsave(&iommu->register_lock, flag);
894         /* Note: Only uses first TLB reg currently */
895         if (val_iva)
896                 dmar_writeq(iommu->reg + tlb_offset, val_iva);
897         dmar_writeq(iommu->reg + tlb_offset + 8, val);
898
899         /* Make sure hardware completes it */
900         IOMMU_WAIT_OP(iommu, tlb_offset + 8,
901                 dmar_readq, (!(val & DMA_TLB_IVT)), val);
902
903         spin_unlock_irqrestore(&iommu->register_lock, flag);
904
905         /* check IOTLB invalidation granularity */
906         if (DMA_TLB_IAIG(val) == 0)
907                 printk(KERN_ERR"IOMMU: flush IOTLB failed\n");
908         if (DMA_TLB_IAIG(val) != DMA_TLB_IIRG(type))
909                 pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n",
910                         (unsigned long long)DMA_TLB_IIRG(type),
911                         (unsigned long long)DMA_TLB_IAIG(val));
912         /* flushing iotlb entries implicitly flushes the write buffer */
913         return 0;
914 }
915
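/*
 * Page-selective IOTLB invalidation of @pages pages starting at @addr.
 * Falls back to a domain-selective flush when the hardware has no PSI
 * support, or when the power-of-two rounded page count exceeds the
 * maximum address mask the hardware reports.
 */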
916 static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did,
917         u64 addr, unsigned int pages, int non_present_entry_flush)
918 {
919         unsigned int mask;
920
921         BUG_ON(addr & (~VTD_PAGE_MASK));
922         BUG_ON(pages == 0);
923
924         /* Fall back to domain selective flush if no PSI support */
925         if (!cap_pgsel_inv(iommu->cap))
926                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
927                                                 DMA_TLB_DSI_FLUSH,
928                                                 non_present_entry_flush);
929
930         /*
931          * PSI requires the page count to be a power of two, and the base
932          * address to be naturally aligned to that size
933          */
934         mask = ilog2(__roundup_pow_of_two(pages));
935         /* Fall back to domain selective flush if the size is too big */
936         if (mask > cap_max_amask_val(iommu->cap))
937                 return iommu->flush.flush_iotlb(iommu, did, 0, 0,
938                         DMA_TLB_DSI_FLUSH, non_present_entry_flush);
939
940         return iommu->flush.flush_iotlb(iommu, did, addr, mask,
941                                         DMA_TLB_PSI_FLUSH,
942                                         non_present_entry_flush);
943 }
944
945 static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu)
946 {
947         u32 pmen;
948         unsigned long flags;
949
950         spin_lock_irqsave(&iommu->register_lock, flags);
951         pmen = readl(iommu->reg + DMAR_PMEN_REG);
952         pmen &= ~DMA_PMEN_EPM;
953         writel(pmen, iommu->reg + DMAR_PMEN_REG);
954
955         /* wait for the protected region status bit to clear */
956         IOMMU_WAIT_OP(iommu, DMAR_PMEN_REG,
957                 readl, !(pmen & DMA_PMEN_PRS), pmen);
958
959         spin_unlock_irqrestore(&iommu->register_lock, flags);
960 }
961
962 static int iommu_enable_translation(struct intel_iommu *iommu)
963 {
964         u32 sts;
965         unsigned long flags;
966
967         spin_lock_irqsave(&iommu->register_lock, flags);
968         writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG);
969
970         /* Make sure hardware completes it */
971         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
972                 readl, (sts & DMA_GSTS_TES), sts);
973
974         iommu->gcmd |= DMA_GCMD_TE;
975         spin_unlock_irqrestore(&iommu->register_lock, flags);
976         return 0;
977 }
978
979 static int iommu_disable_translation(struct intel_iommu *iommu)
980 {
981         u32 sts;
982         unsigned long flag;
983
984         spin_lock_irqsave(&iommu->register_lock, flag);
985         iommu->gcmd &= ~DMA_GCMD_TE;
986         writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG);
987
988         /* Make sure hardware completes it */
989         IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
990                 readl, (!(sts & DMA_GSTS_TES)), sts);
991
992         spin_unlock_irqrestore(&iommu->register_lock, flag);
993         return 0;
994 }
995
996 /* iommu interrupt handling. Most of it is MSI-like. */
997
998 static const char *fault_reason_strings[] =
999 {
1000         "Software",
1001         "Present bit in root entry is clear",
1002         "Present bit in context entry is clear",
1003         "Invalid context entry",
1004         "Access beyond MGAW",
1005         "PTE Write access is not set",
1006         "PTE Read access is not set",
1007         "Next page table ptr is invalid",
1008         "Root table address invalid",
1009         "Context table ptr is invalid",
1010         "non-zero reserved fields in RTP",
1011         "non-zero reserved fields in CTP",
1012         "non-zero reserved fields in PTE",
1013 };
1014 #define MAX_FAULT_REASON_IDX    (ARRAY_SIZE(fault_reason_strings) - 1)
1015
1016 const char *dmar_get_fault_reason(u8 fault_reason)
1017 {
1018         if (fault_reason > MAX_FAULT_REASON_IDX)
1019                 return "Unknown";
1020         else
1021                 return fault_reason_strings[fault_reason];
1022 }
1023
1024 void dmar_msi_unmask(unsigned int irq)
1025 {
1026         struct intel_iommu *iommu = get_irq_data(irq);
1027         unsigned long flag;
1028
1029         /* unmask it */
1030         spin_lock_irqsave(&iommu->register_lock, flag);
1031         writel(0, iommu->reg + DMAR_FECTL_REG);
1032         /* Read a reg back to force the posted write to be flushed */
1033         readl(iommu->reg + DMAR_FECTL_REG);
1034         spin_unlock_irqrestore(&iommu->register_lock, flag);
1035 }
1036
1037 void dmar_msi_mask(unsigned int irq)
1038 {
1039         unsigned long flag;
1040         struct intel_iommu *iommu = get_irq_data(irq);
1041
1042         /* mask it */
1043         spin_lock_irqsave(&iommu->register_lock, flag);
1044         writel(DMA_FECTL_IM, iommu->reg + DMAR_FECTL_REG);
1045         /* Read a reg back to force the posted write to be flushed */
1046         readl(iommu->reg + DMAR_FECTL_REG);
1047         spin_unlock_irqrestore(&iommu->register_lock, flag);
1048 }
1049
1050 void dmar_msi_write(int irq, struct msi_msg *msg)
1051 {
1052         struct intel_iommu *iommu = get_irq_data(irq);
1053         unsigned long flag;
1054
1055         spin_lock_irqsave(&iommu->register_lock, flag);
1056         writel(msg->data, iommu->reg + DMAR_FEDATA_REG);
1057         writel(msg->address_lo, iommu->reg + DMAR_FEADDR_REG);
1058         writel(msg->address_hi, iommu->reg + DMAR_FEUADDR_REG);
1059         spin_unlock_irqrestore(&iommu->register_lock, flag);
1060 }
1061
1062 void dmar_msi_read(int irq, struct msi_msg *msg)
1063 {
1064         struct intel_iommu *iommu = get_irq_data(irq);
1065         unsigned long flag;
1066
1067         spin_lock_irqsave(&iommu->register_lock, flag);
1068         msg->data = readl(iommu->reg + DMAR_FEDATA_REG);
1069         msg->address_lo = readl(iommu->reg + DMAR_FEADDR_REG);
1070         msg->address_hi = readl(iommu->reg + DMAR_FEUADDR_REG);
1071         spin_unlock_irqrestore(&iommu->register_lock, flag);
1072 }
1073
1074 static int iommu_page_fault_do_one(struct intel_iommu *iommu, int type,
1075                 u8 fault_reason, u16 source_id, unsigned long long addr)
1076 {
1077         const char *reason;
1078
1079         reason = dmar_get_fault_reason(fault_reason);
1080
1081         printk(KERN_ERR
1082                 "DMAR:[%s] Request device [%02x:%02x.%d] "
1083                 "fault addr %llx \n"
1084                 "DMAR:[fault reason %02d] %s\n",
1085                 (type ? "DMA Read" : "DMA Write"),
1086                 (source_id >> 8), PCI_SLOT(source_id & 0xFF),
1087                 PCI_FUNC(source_id & 0xFF), addr, fault_reason, reason);
1088         return 0;
1089 }
1090
1091 #define PRIMARY_FAULT_REG_LEN (16)
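/*
 * DMAR fault interrupt handler: walk the primary fault recording
 * registers, report each pending fault via iommu_page_fault_do_one(),
 * clear it, and finally clear the primary fault overflow bit if set.
 */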
1092 static irqreturn_t iommu_page_fault(int irq, void *dev_id)
1093 {
1094         struct intel_iommu *iommu = dev_id;
1095         int reg, fault_index;
1096         u32 fault_status;
1097         unsigned long flag;
1098
1099         spin_lock_irqsave(&iommu->register_lock, flag);
1100         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1101
1102         /* TBD: ignore advanced fault log currently */
1103         if (!(fault_status & DMA_FSTS_PPF))
1104                 goto clear_overflow;
1105
1106         fault_index = dma_fsts_fault_record_index(fault_status);
1107         reg = cap_fault_reg_offset(iommu->cap);
1108         while (1) {
1109                 u8 fault_reason;
1110                 u16 source_id;
1111                 u64 guest_addr;
1112                 int type;
1113                 u32 data;
1114
1115                 /* highest 32 bits */
1116                 data = readl(iommu->reg + reg +
1117                                 fault_index * PRIMARY_FAULT_REG_LEN + 12);
1118                 if (!(data & DMA_FRCD_F))
1119                         break;
1120
1121                 fault_reason = dma_frcd_fault_reason(data);
1122                 type = dma_frcd_type(data);
1123
1124                 data = readl(iommu->reg + reg +
1125                                 fault_index * PRIMARY_FAULT_REG_LEN + 8);
1126                 source_id = dma_frcd_source_id(data);
1127
1128                 guest_addr = dmar_readq(iommu->reg + reg +
1129                                 fault_index * PRIMARY_FAULT_REG_LEN);
1130                 guest_addr = dma_frcd_page_addr(guest_addr);
1131                 /* clear the fault */
1132                 writel(DMA_FRCD_F, iommu->reg + reg +
1133                         fault_index * PRIMARY_FAULT_REG_LEN + 12);
1134
1135                 spin_unlock_irqrestore(&iommu->register_lock, flag);
1136
1137                 iommu_page_fault_do_one(iommu, type, fault_reason,
1138                                 source_id, guest_addr);
1139
1140                 fault_index++;
1141                 if (fault_index > cap_num_fault_regs(iommu->cap))
1142                         fault_index = 0;
1143                 spin_lock_irqsave(&iommu->register_lock, flag);
1144         }
1145 clear_overflow:
1146         /* clear primary fault overflow */
1147         fault_status = readl(iommu->reg + DMAR_FSTS_REG);
1148         if (fault_status & DMA_FSTS_PFO)
1149                 writel(DMA_FSTS_PFO, iommu->reg + DMAR_FSTS_REG);
1150
1151         spin_unlock_irqrestore(&iommu->register_lock, flag);
1152         return IRQ_HANDLED;
1153 }
1154
1155 int dmar_set_interrupt(struct intel_iommu *iommu)
1156 {
1157         int irq, ret;
1158
1159         irq = create_irq();
1160         if (!irq) {
1161                 printk(KERN_ERR "IOMMU: no free vectors\n");
1162                 return -EINVAL;
1163         }
1164
1165         set_irq_data(irq, iommu);
1166         iommu->irq = irq;
1167
1168         ret = arch_setup_dmar_msi(irq);
1169         if (ret) {
1170                 set_irq_data(irq, NULL);
1171                 iommu->irq = 0;
1172                 destroy_irq(irq);
1173                 return 0;
1174         }
1175
1176         /* Make sure the fault registers are cleared */
1177         iommu_page_fault(irq, iommu);
1178
1179         ret = request_irq(irq, iommu_page_fault, 0, iommu->name, iommu);
1180         if (ret)
1181                 printk(KERN_ERR "IOMMU: can't request irq\n");
1182         return ret;
1183 }
1184
1185 static int iommu_init_domains(struct intel_iommu *iommu)
1186 {
1187         unsigned long ndomains;
1188         unsigned long nlongs;
1189
1190         ndomains = cap_ndoms(iommu->cap);
1191         pr_debug("Number of Domains supported <%ld>\n", ndomains);
1192         nlongs = BITS_TO_LONGS(ndomains);
1193
1194         /* TBD: there might be 64K domains,
1195          * consider a different allocation scheme for future chips
1196          */
1197         iommu->domain_ids = kcalloc(nlongs, sizeof(unsigned long), GFP_KERNEL);
1198         if (!iommu->domain_ids) {
1199                 printk(KERN_ERR "Allocating domain id array failed\n");
1200                 return -ENOMEM;
1201         }
1202         iommu->domains = kcalloc(ndomains, sizeof(struct dmar_domain *),
1203                         GFP_KERNEL);
1204         if (!iommu->domains) {
1205                 printk(KERN_ERR "Allocating domain array failed\n");
1206                 kfree(iommu->domain_ids);
1207                 return -ENOMEM;
1208         }
1209
1210         spin_lock_init(&iommu->lock);
1211
1212         /*
1213          * if Caching mode is set, then invalid translations are tagged
1214          * with domain id 0. Hence we need to pre-allocate it.
1215          */
1216         if (cap_caching_mode(iommu->cap))
1217                 set_bit(0, iommu->domain_ids);
1218         return 0;
1219 }
1220
1221
1222 static void domain_exit(struct dmar_domain *domain);
1223 static void vm_domain_exit(struct dmar_domain *domain);
1224
1225 void free_dmar_iommu(struct intel_iommu *iommu)
1226 {
1227         struct dmar_domain *domain;
1228         int i;
1229         unsigned long flags;
1230
1231         i = find_first_bit(iommu->domain_ids, cap_ndoms(iommu->cap));
1232         for (; i < cap_ndoms(iommu->cap); ) {
1233                 domain = iommu->domains[i];
1234                 clear_bit(i, iommu->domain_ids);
1235
1236                 spin_lock_irqsave(&domain->iommu_lock, flags);
1237                 if (--domain->iommu_count == 0) {
1238                         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
1239                                 vm_domain_exit(domain);
1240                         else
1241                                 domain_exit(domain);
1242                 }
1243                 spin_unlock_irqrestore(&domain->iommu_lock, flags);
1244
1245                 i = find_next_bit(iommu->domain_ids,
1246                         cap_ndoms(iommu->cap), i+1);
1247         }
1248
1249         if (iommu->gcmd & DMA_GCMD_TE)
1250                 iommu_disable_translation(iommu);
1251
1252         if (iommu->irq) {
1253                 set_irq_data(iommu->irq, NULL);
1254                 /* This will mask the irq */
1255                 free_irq(iommu->irq, iommu);
1256                 destroy_irq(iommu->irq);
1257         }
1258
1259         kfree(iommu->domains);
1260         kfree(iommu->domain_ids);
1261
1262         g_iommus[iommu->seq_id] = NULL;
1263
1264         /* if all iommus are freed, free g_iommus */
1265         for (i = 0; i < g_num_of_iommus; i++) {
1266                 if (g_iommus[i])
1267                         break;
1268         }
1269
1270         if (i == g_num_of_iommus)
1271                 kfree(g_iommus);
1272
1273         /* free context mapping */
1274         free_context_table(iommu);
1275 }
1276
1277 static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu)
1278 {
1279         unsigned long num;
1280         unsigned long ndomains;
1281         struct dmar_domain *domain;
1282         unsigned long flags;
1283
1284         domain = alloc_domain_mem();
1285         if (!domain)
1286                 return NULL;
1287
1288         ndomains = cap_ndoms(iommu->cap);
1289
1290         spin_lock_irqsave(&iommu->lock, flags);
1291         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1292         if (num >= ndomains) {
1293                 spin_unlock_irqrestore(&iommu->lock, flags);
1294                 free_domain_mem(domain);
1295                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1296                 return NULL;
1297         }
1298
1299         set_bit(num, iommu->domain_ids);
1300         domain->id = num;
1301         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
1302         set_bit(iommu->seq_id, &domain->iommu_bmp);
1303         domain->flags = 0;
1304         iommu->domains[num] = domain;
1305         spin_unlock_irqrestore(&iommu->lock, flags);
1306
1307         return domain;
1308 }
1309
1310 static void iommu_free_domain(struct dmar_domain *domain)
1311 {
1312         unsigned long flags;
1313         struct intel_iommu *iommu;
1314
1315         iommu = domain_get_iommu(domain);
1316
1317         spin_lock_irqsave(&iommu->lock, flags);
1318         clear_bit(domain->id, iommu->domain_ids);
1319         spin_unlock_irqrestore(&iommu->lock, flags);
1320 }
1321
1322 static struct iova_domain reserved_iova_list;
1323 static struct lock_class_key reserved_alloc_key;
1324 static struct lock_class_key reserved_rbtree_key;
1325
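/*
 * Build the global list of iova ranges that must never be allocated for
 * DMA: the IOAPIC MMIO window and every PCI device's MMIO resources (so
 * peer-to-peer ranges stay out of the DMA address space). Each domain
 * copies this list via domain_reserve_special_ranges().
 */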
1326 static void dmar_init_reserved_ranges(void)
1327 {
1328         struct pci_dev *pdev = NULL;
1329         struct iova *iova;
1330         int i;
1331         u64 addr, size;
1332
1333         init_iova_domain(&reserved_iova_list, DMA_32BIT_PFN);
1334
1335         lockdep_set_class(&reserved_iova_list.iova_alloc_lock,
1336                 &reserved_alloc_key);
1337         lockdep_set_class(&reserved_iova_list.iova_rbtree_lock,
1338                 &reserved_rbtree_key);
1339
1340         /* IOAPIC ranges shouldn't be accessed by DMA */
1341         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(IOAPIC_RANGE_START),
1342                 IOVA_PFN(IOAPIC_RANGE_END));
1343         if (!iova)
1344                 printk(KERN_ERR "Reserve IOAPIC range failed\n");
1345
1346         /* Reserve all PCI MMIO to avoid peer-to-peer access */
1347         for_each_pci_dev(pdev) {
1348                 struct resource *r;
1349
1350                 for (i = 0; i < PCI_NUM_RESOURCES; i++) {
1351                         r = &pdev->resource[i];
1352                         if (!r->flags || !(r->flags & IORESOURCE_MEM))
1353                                 continue;
1354                         addr = r->start;
1355                         addr &= PAGE_MASK;
1356                         size = r->end - addr;
1357                         size = PAGE_ALIGN(size);
1358                         iova = reserve_iova(&reserved_iova_list, IOVA_PFN(addr),
1359                                 IOVA_PFN(size + addr) - 1);
1360                         if (!iova)
1361                                 printk(KERN_ERR "Reserve iova failed\n");
1362                 }
1363         }
1364
1365 }
1366
1367 static void domain_reserve_special_ranges(struct dmar_domain *domain)
1368 {
1369         copy_reserved_iova(&reserved_iova_list, &domain->iovad);
1370 }
1371
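/*
 * Round a guest address width up to the nearest width the page-table
 * format can express: 12 bits of page offset plus a whole number of
 * 9-bit levels. For example, a 40-bit guest width is rounded up to 48.
 */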
1372 static inline int guestwidth_to_adjustwidth(int gaw)
1373 {
1374         int agaw;
1375         int r = (gaw - 12) % 9;
1376
1377         if (r == 0)
1378                 agaw = gaw;
1379         else
1380                 agaw = gaw + 9 - r;
1381         if (agaw > 64)
1382                 agaw = 64;
1383         return agaw;
1384 }
1385
1386 static int domain_init(struct dmar_domain *domain, int guest_width)
1387 {
1388         struct intel_iommu *iommu;
1389         int adjust_width, agaw;
1390         unsigned long sagaw;
1391
1392         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
1393         spin_lock_init(&domain->mapping_lock);
1394         spin_lock_init(&domain->iommu_lock);
1395
1396         domain_reserve_special_ranges(domain);
1397
1398         /* calculate AGAW */
1399         iommu = domain_get_iommu(domain);
1400         if (guest_width > cap_mgaw(iommu->cap))
1401                 guest_width = cap_mgaw(iommu->cap);
1402         domain->gaw = guest_width;
1403         adjust_width = guestwidth_to_adjustwidth(guest_width);
1404         agaw = width_to_agaw(adjust_width);
1405         sagaw = cap_sagaw(iommu->cap);
1406         if (!test_bit(agaw, &sagaw)) {
1407                 /* hardware doesn't support it, choose a bigger one */
1408                 pr_debug("IOMMU: hardware doesn't support agaw %d\n", agaw);
1409                 agaw = find_next_bit(&sagaw, 5, agaw);
1410                 if (agaw >= 5)
1411                         return -ENODEV;
1412         }
1413         domain->agaw = agaw;
1414         INIT_LIST_HEAD(&domain->devices);
1415
1416         if (ecap_coherent(iommu->ecap))
1417                 domain->iommu_coherency = 1;
1418         else
1419                 domain->iommu_coherency = 0;
1420
1421         domain->iommu_count = 1;
1422
1423         /* always allocate the top pgd */
1424         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
1425         if (!domain->pgd)
1426                 return -ENOMEM;
1427         __iommu_flush_cache(iommu, domain->pgd, PAGE_SIZE);
1428         return 0;
1429 }
1430
1431 static void domain_exit(struct dmar_domain *domain)
1432 {
1433         u64 end;
1434
1435         /* Domain 0 is reserved, so don't process it */
1436         if (!domain)
1437                 return;
1438
1439         domain_remove_dev_info(domain);
1440         /* destroy iovas */
1441         put_iova_domain(&domain->iovad);
1442         end = DOMAIN_MAX_ADDR(domain->gaw);
1443         end = end & (~PAGE_MASK);
1444
1445         /* clear ptes */
1446         dma_pte_clear_range(domain, 0, end);
1447
1448         /* free page tables */
1449         dma_pte_free_pagetable(domain, 0, end);
1450
1451         iommu_free_domain(domain);
1452         free_domain_mem(domain);
1453 }
1454
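/*
 * Install a context entry so requests from (bus, devfn) are translated
 * through @domain's page tables. For virtual machine domains the entry
 * may use a per-iommu domain id, and top page-table levels are skipped
 * when this iommu supports a smaller agaw than the domain's.
 */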
1455 static int domain_context_mapping_one(struct dmar_domain *domain,
1456                 u8 bus, u8 devfn)
1457 {
1458         struct context_entry *context;
1459         unsigned long flags;
1460         struct intel_iommu *iommu;
1461         struct dma_pte *pgd;
1462         unsigned long num;
1463         unsigned long ndomains;
1464         int id;
1465         int agaw;
1466
1467         pr_debug("Set context mapping for %02x:%02x.%d\n",
1468                 bus, PCI_SLOT(devfn), PCI_FUNC(devfn));
1469         BUG_ON(!domain->pgd);
1470
1471         iommu = device_to_iommu(bus, devfn);
1472         if (!iommu)
1473                 return -ENODEV;
1474
1475         context = device_to_context_entry(iommu, bus, devfn);
1476         if (!context)
1477                 return -ENOMEM;
1478         spin_lock_irqsave(&iommu->lock, flags);
1479         if (context_present(context)) {
1480                 spin_unlock_irqrestore(&iommu->lock, flags);
1481                 return 0;
1482         }
1483
1484         id = domain->id;
1485         pgd = domain->pgd;
1486
1487         if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) {
1488                 int found = 0;
1489
1490                 /* find an available domain id for this device in iommu */
1491                 ndomains = cap_ndoms(iommu->cap);
1492                 num = find_first_bit(iommu->domain_ids, ndomains);
1493                 for (; num < ndomains; ) {
1494                         if (iommu->domains[num] == domain) {
1495                                 id = num;
1496                                 found = 1;
1497                                 break;
1498                         }
1499                         num = find_next_bit(iommu->domain_ids,
1500                                             cap_ndoms(iommu->cap), num+1);
1501                 }
1502
1503                 if (found == 0) {
1504                         num = find_first_zero_bit(iommu->domain_ids, ndomains);
1505                         if (num >= ndomains) {
1506                                 spin_unlock_irqrestore(&iommu->lock, flags);
1507                                 printk(KERN_ERR "IOMMU: no free domain ids\n");
1508                                 return -EFAULT;
1509                         }
1510
1511                         set_bit(num, iommu->domain_ids);
1512                         iommu->domains[num] = domain;
1513                         id = num;
1514                 }
1515
1516                 /* Skip top levels of page tables for
1517                  * iommus whose agaw is smaller than the default.
1518                  */
1519                 for (agaw = domain->agaw; agaw != iommu->agaw; agaw--) {
1520                         pgd = phys_to_virt(dma_pte_addr(pgd));
1521                         if (!dma_pte_present(pgd)) {
1522                                 spin_unlock_irqrestore(&iommu->lock, flags);
1523                                 return -ENOMEM;
1524                         }
1525                 }
1526         }
1527
1528         context_set_domain_id(context, id);
1529         context_set_address_width(context, iommu->agaw);
1530         context_set_address_root(context, virt_to_phys(pgd));
1531         context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL);
1532         context_set_fault_enable(context);
1533         context_set_present(context);
1534         domain_flush_cache(domain, context, sizeof(*context));
1535
1536         /* it's a non-present to present mapping */
1537         if (iommu->flush.flush_context(iommu, domain->id,
1538                 (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT,
1539                 DMA_CCMD_DEVICE_INVL, 1))
1540                 iommu_flush_write_buffer(iommu);
1541         else
1542                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0);
1543
1544         spin_unlock_irqrestore(&iommu->lock, flags);
1545
1546         spin_lock_irqsave(&domain->iommu_lock, flags);
1547         if (!test_and_set_bit(iommu->seq_id, &domain->iommu_bmp)) {
1548                 domain->iommu_count++;
1549                 domain_update_iommu_coherency(domain);
1550         }
1551         spin_unlock_irqrestore(&domain->iommu_lock, flags);
1552         return 0;
1553 }
1554
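/*
 * Context-map @pdev. If the device sits behind a bridge, also map each
 * bridge up to the topmost one, and finally the bridge's secondary bus
 * (devfn 0) for a PCIe-to-PCI bridge or the bridge itself for a legacy
 * PCI bridge, since DMA requests may carry the bridge's source-id.
 */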
1555 static int
1556 domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev)
1557 {
1558         int ret;
1559         struct pci_dev *tmp, *parent;
1560
1561         ret = domain_context_mapping_one(domain, pdev->bus->number,
1562                 pdev->devfn);
1563         if (ret)
1564                 return ret;
1565
1566         /* dependent device mapping */
1567         tmp = pci_find_upstream_pcie_bridge(pdev);
1568         if (!tmp)
1569                 return 0;
1570         /* Secondary interface's bus number and devfn 0 */
1571         parent = pdev->bus->self;
1572         while (parent != tmp) {
1573                 ret = domain_context_mapping_one(domain, parent->bus->number,
1574                         parent->devfn);
1575                 if (ret)
1576                         return ret;
1577                 parent = parent->bus->self;
1578         }
1579         if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */
1580                 return domain_context_mapping_one(domain,
1581                         tmp->subordinate->number, 0);
1582         else /* this is a legacy PCI bridge */
1583                 return domain_context_mapping_one(domain,
1584                         tmp->bus->number, tmp->devfn);
1585 }
1586
1587 static int domain_context_mapped(struct pci_dev *pdev)
1588 {
1589         int ret;
1590         struct pci_dev *tmp, *parent;
1591         struct intel_iommu *iommu;
1592
1593         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
1594         if (!iommu)
1595                 return -ENODEV;
1596
1597         ret = device_context_mapped(iommu,
1598                 pdev->bus->number, pdev->devfn);
1599         if (!ret)
1600                 return ret;
1601         /* dependent device mapping */
1602         tmp = pci_find_upstream_pcie_bridge(pdev);
1603         if (!tmp)
1604                 return ret;
1605         /* Secondary interface's bus number and devfn 0 */
1606         parent = pdev->bus->self;
1607         while (parent != tmp) {
1608                 ret = device_context_mapped(iommu, parent->bus->number,
1609                         parent->devfn);
1610                 if (!ret)
1611                         return ret;
1612                 parent = parent->bus->self;
1613         }
1614         if (tmp->is_pcie)
1615                 return device_context_mapped(iommu,
1616                         tmp->subordinate->number, 0);
1617         else
1618                 return device_context_mapped(iommu,
1619                         tmp->bus->number, tmp->devfn);
1620 }
1621
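/*
 * Map the physical range [hpa, hpa + size) at IO virtual address @iova
 * in @domain, one VT-d page at a time, with the given protection bits.
 */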
1622 static int
1623 domain_page_mapping(struct dmar_domain *domain, dma_addr_t iova,
1624                         u64 hpa, size_t size, int prot)
1625 {
1626         u64 start_pfn, end_pfn;
1627         struct dma_pte *pte;
1628         int index;
1629         int addr_width = agaw_to_width(domain->agaw);
1630
1631         hpa &= (((u64)1) << addr_width) - 1;
1632
1633         if ((prot & (DMA_PTE_READ|DMA_PTE_WRITE)) == 0)
1634                 return -EINVAL;
1635         iova &= PAGE_MASK;
1636         start_pfn = ((u64)hpa) >> VTD_PAGE_SHIFT;
1637         end_pfn = (VTD_PAGE_ALIGN(((u64)hpa) + size)) >> VTD_PAGE_SHIFT;
1638         index = 0;
1639         while (start_pfn < end_pfn) {
1640                 pte = addr_to_dma_pte(domain, iova + VTD_PAGE_SIZE * index);
1641                 if (!pte)
1642                         return -ENOMEM;
1643                 /* We don't need a lock here; nobody else
1644                  * touches this iova range.
1645                  */
1646                 BUG_ON(dma_pte_addr(pte));
1647                 dma_set_pte_addr(pte, start_pfn << VTD_PAGE_SHIFT);
1648                 dma_set_pte_prot(pte, prot);
1649                 domain_flush_cache(domain, pte, sizeof(*pte));
1650                 start_pfn++;
1651                 index++;
1652         }
1653         return 0;
1654 }
1655
1656 static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn)
1657 {
1658         if (!iommu)
1659                 return;
1660
1661         clear_context_table(iommu, bus, devfn);
1662         iommu->flush.flush_context(iommu, 0, 0, 0,
1663                                            DMA_CCMD_GLOBAL_INVL, 0);
1664         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
1665                                          DMA_TLB_GLOBAL_FLUSH, 0);
1666 }
1667
1668 static void domain_remove_dev_info(struct dmar_domain *domain)
1669 {
1670         struct device_domain_info *info;
1671         unsigned long flags;
1672         struct intel_iommu *iommu;
1673
1674         spin_lock_irqsave(&device_domain_lock, flags);
1675         while (!list_empty(&domain->devices)) {
1676                 info = list_entry(domain->devices.next,
1677                         struct device_domain_info, link);
1678                 list_del(&info->link);
1679                 list_del(&info->global);
1680                 if (info->dev)
1681                         info->dev->dev.archdata.iommu = NULL;
1682                 spin_unlock_irqrestore(&device_domain_lock, flags);
1683
1684                 iommu = device_to_iommu(info->bus, info->devfn);
1685                 iommu_detach_dev(iommu, info->bus, info->devfn);
1686                 free_devinfo_mem(info);
1687
1688                 spin_lock_irqsave(&device_domain_lock, flags);
1689         }
1690         spin_unlock_irqrestore(&device_domain_lock, flags);
1691 }
1692
1693 /*
1694  * find_domain
1695  * Note: we use struct pci_dev->dev.archdata.iommu to store the info
1696  */
1697 static struct dmar_domain *
1698 find_domain(struct pci_dev *pdev)
1699 {
1700         struct device_domain_info *info;
1701
1702         /* No lock here, assumes no domain exit in normal case */
1703         info = pdev->dev.archdata.iommu;
1704         if (info)
1705                 return info->domain;
1706         return NULL;
1707 }
1708
1709 /* domain is initialized */
1710 static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw)
1711 {
1712         struct dmar_domain *domain, *found = NULL;
1713         struct intel_iommu *iommu;
1714         struct dmar_drhd_unit *drhd;
1715         struct device_domain_info *info, *tmp;
1716         struct pci_dev *dev_tmp;
1717         unsigned long flags;
1718         int bus = 0, devfn = 0;
1719
1720         domain = find_domain(pdev);
1721         if (domain)
1722                 return domain;
1723
1724         dev_tmp = pci_find_upstream_pcie_bridge(pdev);
1725         if (dev_tmp) {
1726                 if (dev_tmp->is_pcie) {
1727                         bus = dev_tmp->subordinate->number;
1728                         devfn = 0;
1729                 } else {
1730                         bus = dev_tmp->bus->number;
1731                         devfn = dev_tmp->devfn;
1732                 }
1733                 spin_lock_irqsave(&device_domain_lock, flags);
1734                 list_for_each_entry(info, &device_domain_list, global) {
1735                         if (info->bus == bus && info->devfn == devfn) {
1736                                 found = info->domain;
1737                                 break;
1738                         }
1739                 }
1740                 spin_unlock_irqrestore(&device_domain_lock, flags);
1741                 /* the pcie-pci bridge already has a domain, use it */
1742                 if (found) {
1743                         domain = found;
1744                         goto found_domain;
1745                 }
1746         }
1747
1748         /* Allocate new domain for the device */
1749         drhd = dmar_find_matched_drhd_unit(pdev);
1750         if (!drhd) {
1751                 printk(KERN_ERR "IOMMU: can't find DMAR for device %s\n",
1752                         pci_name(pdev));
1753                 return NULL;
1754         }
1755         iommu = drhd->iommu;
1756
1757         domain = iommu_alloc_domain(iommu);
1758         if (!domain)
1759                 goto error;
1760
1761         if (domain_init(domain, gaw)) {
1762                 domain_exit(domain);
1763                 goto error;
1764         }
1765
1766         /* register pcie-to-pci device */
1767         if (dev_tmp) {
1768                 info = alloc_devinfo_mem();
1769                 if (!info) {
1770                         domain_exit(domain);
1771                         goto error;
1772                 }
1773                 info->bus = bus;
1774                 info->devfn = devfn;
1775                 info->dev = NULL;
1776                 info->domain = domain;
1777                 /* This domain is shared by devices under p2p bridge */
1778                 domain->flags |= DOMAIN_FLAG_P2P_MULTIPLE_DEVICES;
1779
1780                 /* the pcie-to-pci bridge already has a domain, use it */
1781                 found = NULL;
1782                 spin_lock_irqsave(&device_domain_lock, flags);
1783                 list_for_each_entry(tmp, &device_domain_list, global) {
1784                         if (tmp->bus == bus && tmp->devfn == devfn) {
1785                                 found = tmp->domain;
1786                                 break;
1787                         }
1788                 }
1789                 if (found) {
1790                         free_devinfo_mem(info);
1791                         domain_exit(domain);
1792                         domain = found;
1793                 } else {
1794                         list_add(&info->link, &domain->devices);
1795                         list_add(&info->global, &device_domain_list);
1796                 }
1797                 spin_unlock_irqrestore(&device_domain_lock, flags);
1798         }
1799
1800 found_domain:
1801         info = alloc_devinfo_mem();
1802         if (!info)
1803                 goto error;
1804         info->bus = pdev->bus->number;
1805         info->devfn = pdev->devfn;
1806         info->dev = pdev;
1807         info->domain = domain;
1808         spin_lock_irqsave(&device_domain_lock, flags);
1809         /* somebody is fast */
1810         found = find_domain(pdev);
1811         if (found != NULL) {
1812                 spin_unlock_irqrestore(&device_domain_lock, flags);
1813                 if (found != domain) {
1814                         domain_exit(domain);
1815                         domain = found;
1816                 }
1817                 free_devinfo_mem(info);
1818                 return domain;
1819         }
1820         list_add(&info->link, &domain->devices);
1821         list_add(&info->global, &device_domain_list);
1822         pdev->dev.archdata.iommu = info;
1823         spin_unlock_irqrestore(&device_domain_lock, flags);
1824         return domain;
1825 error:
1826         /* recheck it here, maybe others set it */
1827         return find_domain(pdev);
1828 }
1829
1830 static int iommu_prepare_identity_map(struct pci_dev *pdev,
1831                                       unsigned long long start,
1832                                       unsigned long long end)
1833 {
1834         struct dmar_domain *domain;
1835         unsigned long size;
1836         unsigned long long base;
1837         int ret;
1838
1839         printk(KERN_INFO
1840                 "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n",
1841                 pci_name(pdev), start, end);
1842         /* page table init */
1843         domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH);
1844         if (!domain)
1845                 return -ENOMEM;
1846
1847         /* The address might not be aligned */
1848         base = start & PAGE_MASK;
1849         size = end - base;
1850         size = PAGE_ALIGN(size);
1851         if (!reserve_iova(&domain->iovad, IOVA_PFN(base),
1852                         IOVA_PFN(base + size) - 1)) {
1853                 printk(KERN_ERR "IOMMU: reserve iova failed\n");
1854                 ret = -ENOMEM;
1855                 goto error;
1856         }
1857
1858         pr_debug("Mapping reserved region %lx@%llx for %s\n",
1859                 size, base, pci_name(pdev));
1860         /*
1861          * RMRR range might have overlap with physical memory range,
1862          * clear it first
1863          */
1864         dma_pte_clear_range(domain, base, base + size);
1865
1866         ret = domain_page_mapping(domain, base, base, size,
1867                 DMA_PTE_READ|DMA_PTE_WRITE);
1868         if (ret)
1869                 goto error;
1870
1871         /* context entry init */
1872         ret = domain_context_mapping(domain, pdev);
1873         if (!ret)
1874                 return 0;
1875 error:
1876         domain_exit(domain);
1877         return ret;
1878
1879 }
1880
1881 static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr,
1882         struct pci_dev *pdev)
1883 {
1884         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
1885                 return 0;
1886         return iommu_prepare_identity_map(pdev, rmrr->base_address,
1887                 rmrr->end_address + 1);
1888 }
1889
1890 #ifdef CONFIG_DMAR_GFX_WA
1891 struct iommu_prepare_data {
1892         struct pci_dev *pdev;
1893         int ret;
1894 };
1895
1896 static int __init iommu_prepare_work_fn(unsigned long start_pfn,
1897                                          unsigned long end_pfn, void *datax)
1898 {
1899         struct iommu_prepare_data *data;
1900
1901         data = (struct iommu_prepare_data *)datax;
1902
1903         data->ret = iommu_prepare_identity_map(data->pdev,
1904                                 start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
1905         return data->ret;
1906
1907 }
1908
1909 static int __init iommu_prepare_with_active_regions(struct pci_dev *pdev)
1910 {
1911         int nid;
1912         struct iommu_prepare_data data;
1913
1914         data.pdev = pdev;
1915         data.ret = 0;
1916
1917         for_each_online_node(nid) {
1918                 work_with_active_regions(nid, iommu_prepare_work_fn, &data);
1919                 if (data.ret)
1920                         return data.ret;
1921         }
1922         return data.ret;
1923 }
1924
1925 static void __init iommu_prepare_gfx_mapping(void)
1926 {
1927         struct pci_dev *pdev = NULL;
1928         int ret;
1929
1930         for_each_pci_dev(pdev) {
1931                 if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO ||
1932                                 !IS_GFX_DEVICE(pdev))
1933                         continue;
1934                 printk(KERN_INFO "IOMMU: gfx device %s 1-1 mapping\n",
1935                         pci_name(pdev));
1936                 ret = iommu_prepare_with_active_regions(pdev);
1937                 if (ret)
1938                         printk(KERN_ERR "IOMMU: mapping reserved region failed\n");
1939         }
1940 }
1941 #else /* !CONFIG_DMAR_GFX_WA */
1942 static inline void iommu_prepare_gfx_mapping(void)
1943 {
1944         return;
1945 }
1946 #endif
1947
1948 #ifdef CONFIG_DMAR_FLOPPY_WA
1949 static inline void iommu_prepare_isa(void)
1950 {
1951         struct pci_dev *pdev;
1952         int ret;
1953
1954         pdev = pci_get_class(PCI_CLASS_BRIDGE_ISA << 8, NULL);
1955         if (!pdev)
1956                 return;
1957
1958         printk(KERN_INFO "IOMMU: Prepare 0-16M unity mapping for LPC\n");
1959         ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024);
1960
1961         if (ret)
1962                 printk(KERN_ERR "IOMMU: Failed to create 0-16M identity map, "
1963                         "floppy might not work\n");
1964
1965 }
1966 #else
1967 static inline void iommu_prepare_isa(void)
1968 {
1969         return;
1970 }
1971 #endif /* !CONFIG_DMAR_FLOPPY_WA */
1972
1973 static int __init init_dmars(void)
1974 {
1975         struct dmar_drhd_unit *drhd;
1976         struct dmar_rmrr_unit *rmrr;
1977         struct pci_dev *pdev;
1978         struct intel_iommu *iommu;
1979         int i, ret, unit = 0;
1980
1981         /*
1982          * for each drhd
1983          *    allocate root
1984          *    initialize and program root entry to not present
1985          * endfor
1986          */
1987         for_each_drhd_unit(drhd) {
1988                 g_num_of_iommus++;
1989                 /*
1990                  * lock not needed as this is only incremented in the
1991                  * single-threaded kernel __init code path; all other
1992                  * accesses are read-only
1993                  */
1994         }
1995
1996         g_iommus = kcalloc(g_num_of_iommus, sizeof(struct intel_iommu *),
1997                         GFP_KERNEL);
1998         if (!g_iommus) {
1999                 printk(KERN_ERR "Allocating global iommu array failed\n");
2000                 ret = -ENOMEM;
2001                 goto error;
2002         }
2003
2004         deferred_flush = kzalloc(g_num_of_iommus *
2005                 sizeof(struct deferred_flush_tables), GFP_KERNEL);
2006         if (!deferred_flush) {
2007                 kfree(g_iommus);
2008                 ret = -ENOMEM;
2009                 goto error;
2010         }
2011
2012         for_each_drhd_unit(drhd) {
2013                 if (drhd->ignored)
2014                         continue;
2015
2016                 iommu = drhd->iommu;
2017                 g_iommus[iommu->seq_id] = iommu;
2018
2019                 ret = iommu_init_domains(iommu);
2020                 if (ret)
2021                         goto error;
2022
2023                 /*
2024                  * TBD:
2025                  * we could share the same root & context tables
2026                  * among all IOMMUs. Need to split it out later.
2027                  */
2028                 ret = iommu_alloc_root_entry(iommu);
2029                 if (ret) {
2030                         printk(KERN_ERR "IOMMU: allocate root entry failed\n");
2031                         goto error;
2032                 }
2033         }
2034
2035         for_each_drhd_unit(drhd) {
2036                 if (drhd->ignored)
2037                         continue;
2038
2039                 iommu = drhd->iommu;
2040                 if (dmar_enable_qi(iommu)) {
2041                         /*
2042                          * Queued Invalidate not enabled, use Register Based
2043                          * Invalidate
2044                          */
2045                         iommu->flush.flush_context = __iommu_flush_context;
2046                         iommu->flush.flush_iotlb = __iommu_flush_iotlb;
2047                         printk(KERN_INFO "IOMMU 0x%Lx: using Register based "
2048                                "invalidation\n",
2049                                (unsigned long long)drhd->reg_base_addr);
2050                 } else {
2051                         iommu->flush.flush_context = qi_flush_context;
2052                         iommu->flush.flush_iotlb = qi_flush_iotlb;
2053                         printk(KERN_INFO "IOMMU 0x%Lx: using Queued "
2054                                "invalidation\n",
2055                                (unsigned long long)drhd->reg_base_addr);
2056                 }
2057         }
2058
2059         /*
2060          * For each rmrr
2061          *   for each dev attached to rmrr
2062          *   do
2063          *     locate drhd for dev, alloc domain for dev
2064          *     allocate free domain
2065          *     allocate page table entries for rmrr
2066          *     if context not allocated for bus
2067          *           allocate and init context
2068          *           set present in root table for this bus
2069          *     init context with domain, translation etc
2070          *    endfor
2071          * endfor
2072          */
2073         for_each_rmrr_units(rmrr) {
2074                 for (i = 0; i < rmrr->devices_cnt; i++) {
2075                         pdev = rmrr->devices[i];
2076                         /* some BIOSes list non-existent devices in the DMAR table */
2077                         if (!pdev)
2078                                 continue;
2079                         ret = iommu_prepare_rmrr_dev(rmrr, pdev);
2080                         if (ret)
2081                                 printk(KERN_ERR
2082                                  "IOMMU: mapping reserved region failed\n");
2083                 }
2084         }
2085
2086         iommu_prepare_gfx_mapping();
2087
2088         iommu_prepare_isa();
2089
2090         /*
2091          * for each drhd
2092          *   enable fault log
2093          *   global invalidate context cache
2094          *   global invalidate iotlb
2095          *   enable translation
2096          */
2097         for_each_drhd_unit(drhd) {
2098                 if (drhd->ignored)
2099                         continue;
2100                 iommu = drhd->iommu;
2101                 sprintf(iommu->name, "dmar%d", unit++);
2102
2103                 iommu_flush_write_buffer(iommu);
2104
2105                 ret = dmar_set_interrupt(iommu);
2106                 if (ret)
2107                         goto error;
2108
2109                 iommu_set_root_entry(iommu);
2110
2111                 iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL,
2112                                            0);
2113                 iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH,
2114                                          0);
2115                 iommu_disable_protect_mem_regions(iommu);
2116
2117                 ret = iommu_enable_translation(iommu);
2118                 if (ret)
2119                         goto error;
2120         }
2121
2122         return 0;
2123 error:
2124         for_each_drhd_unit(drhd) {
2125                 if (drhd->ignored)
2126                         continue;
2127                 iommu = drhd->iommu;
2128                 free_iommu(iommu);
2129         }
2130         kfree(g_iommus);
2131         return ret;
2132 }
2133
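/* Size of the page-aligned region needed to cover @size bytes at @host_addr. */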
2134 static inline u64 aligned_size(u64 host_addr, size_t size)
2135 {
2136         u64 addr;
2137         addr = (host_addr & (~PAGE_MASK)) + size;
2138         return PAGE_ALIGN(addr);
2139 }
2140
2141 struct iova *
2142 iommu_alloc_iova(struct dmar_domain *domain, size_t size, u64 end)
2143 {
2144         struct iova *piova;
2145
2146         /* Make sure it's in range */
2147         end = min_t(u64, DOMAIN_MAX_ADDR(domain->gaw), end);
2148         if (!size || (IOVA_START_ADDR + size > end))
2149                 return NULL;
2150
2151         piova = alloc_iova(&domain->iovad,
2152                         size >> PAGE_SHIFT, IOVA_PFN(end), 1);
2153         return piova;
2154 }
2155
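/*
 * Allocate an IOVA of @size bytes for @dev.  For 64-bit capable devices
 * (unless forcedac is set) try the 32-bit range first, then fall back to
 * the device's full DMA mask.
 */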
2156 static struct iova *
2157 __intel_alloc_iova(struct device *dev, struct dmar_domain *domain,
2158                    size_t size, u64 dma_mask)
2159 {
2160         struct pci_dev *pdev = to_pci_dev(dev);
2161         struct iova *iova = NULL;
2162
2163         if (dma_mask <= DMA_32BIT_MASK || dmar_forcedac)
2164                 iova = iommu_alloc_iova(domain, size, dma_mask);
2165         else {
2166                 /*
2167                  * First try to allocate an io virtual address in
2168                  * DMA_32BIT_MASK and if that fails then try allocating
2169                  * from higher range
2170                  */
2171                 iova = iommu_alloc_iova(domain, size, DMA_32BIT_MASK);
2172                 if (!iova)
2173                         iova = iommu_alloc_iova(domain, size, dma_mask);
2174         }
2175
2176         if (!iova) {
2177                 printk(KERN_ERR "Allocating iova for %s failed\n", pci_name(pdev));
2178                 return NULL;
2179         }
2180
2181         return iova;
2182 }
2183
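/*
 * Find (or create) the remapping domain for @pdev and make sure its
 * context entry is set up.
 */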
2184 static struct dmar_domain *
2185 get_valid_domain_for_dev(struct pci_dev *pdev)
2186 {
2187         struct dmar_domain *domain;
2188         int ret;
2189
2190         domain = get_domain_for_dev(pdev,
2191                         DEFAULT_DOMAIN_ADDRESS_WIDTH);
2192         if (!domain) {
2193                 printk(KERN_ERR
2194                         "Allocating domain for %s failed\n", pci_name(pdev));
2195                 return NULL;
2196         }
2197
2198         /* make sure context mapping is ok */
2199         if (unlikely(!domain_context_mapped(pdev))) {
2200                 ret = domain_context_mapping(domain, pdev);
2201                 if (ret) {
2202                         printk(KERN_ERR
2203                                 "Domain context map for %s failed\n",
2204                                 pci_name(pdev));
2205                         return NULL;
2206                 }
2207         }
2208
2209         return domain;
2210 }
2211
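/*
 * Map @size bytes of physical memory at @paddr for DMA by @hwdev and
 * return the bus (IO virtual) address, or 0 on failure.  Devices marked
 * for pass-through get the physical address back unchanged.
 */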
2212 static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr,
2213                                      size_t size, int dir, u64 dma_mask)
2214 {
2215         struct pci_dev *pdev = to_pci_dev(hwdev);
2216         struct dmar_domain *domain;
2217         phys_addr_t start_paddr;
2218         struct iova *iova;
2219         int prot = 0;
2220         int ret;
2221         struct intel_iommu *iommu;
2222
2223         BUG_ON(dir == DMA_NONE);
2224         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2225                 return paddr;
2226
2227         domain = get_valid_domain_for_dev(pdev);
2228         if (!domain)
2229                 return 0;
2230
2231         iommu = domain_get_iommu(domain);
2232         size = aligned_size((u64)paddr, size);
2233
2234         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2235         if (!iova)
2236                 goto error;
2237
2238         start_paddr = (phys_addr_t)iova->pfn_lo << PAGE_SHIFT;
2239
2240         /*
2241          * Check if DMAR supports zero-length reads on write only
2242          * mappings..
2243          */
2244         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2245                         !cap_zlr(iommu->cap))
2246                 prot |= DMA_PTE_READ;
2247         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2248                 prot |= DMA_PTE_WRITE;
2249         /*
2250          * paddr through (paddr + size) might cover only a partial page; we
2251          * should map the whole page.  Note: if two parts of one page are
2252          * mapped separately, we might get two guest addresses mapping to the
2253          * same host paddr, but this is not a big problem
2254          */
2255         ret = domain_page_mapping(domain, start_paddr,
2256                 ((u64)paddr) & PAGE_MASK, size, prot);
2257         if (ret)
2258                 goto error;
2259
2260         /* it's a non-present to present mapping */
2261         ret = iommu_flush_iotlb_psi(iommu, domain->id,
2262                         start_paddr, size >> VTD_PAGE_SHIFT, 1);
2263         if (ret)
2264                 iommu_flush_write_buffer(iommu);
2265
2266         return start_paddr + ((u64)paddr & (~PAGE_MASK));
2267
2268 error:
2269         if (iova)
2270                 __free_iova(&domain->iovad, iova);
2271         printk(KERN_ERR "Device %s request: %lx@%llx dir %d --- failed\n",
2272                 pci_name(pdev), size, (unsigned long long)paddr, dir);
2273         return 0;
2274 }
2275
2276 dma_addr_t intel_map_single(struct device *hwdev, phys_addr_t paddr,
2277                             size_t size, int dir)
2278 {
2279         return __intel_map_single(hwdev, paddr, size, dir,
2280                                   to_pci_dev(hwdev)->dma_mask);
2281 }
2282
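/*
 * Flush the IOTLB of every IOMMU that has deferred unmaps pending and
 * free the corresponding IOVAs.  Called with async_umap_flush_lock held.
 */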
2283 static void flush_unmaps(void)
2284 {
2285         int i, j;
2286
2287         timer_on = 0;
2288
2289         /* just flush them all */
2290         for (i = 0; i < g_num_of_iommus; i++) {
2291                 struct intel_iommu *iommu = g_iommus[i];
2292                 if (!iommu)
2293                         continue;
2294
2295                 if (deferred_flush[i].next) {
2296                         iommu->flush.flush_iotlb(iommu, 0, 0, 0,
2297                                                  DMA_TLB_GLOBAL_FLUSH, 0);
2298                         for (j = 0; j < deferred_flush[i].next; j++) {
2299                                 __free_iova(&deferred_flush[i].domain[j]->iovad,
2300                                                 deferred_flush[i].iova[j]);
2301                         }
2302                         deferred_flush[i].next = 0;
2303                 }
2304         }
2305
2306         list_size = 0;
2307 }
2308
2309 static void flush_unmaps_timeout(unsigned long data)
2310 {
2311         unsigned long flags;
2312
2313         spin_lock_irqsave(&async_umap_flush_lock, flags);
2314         flush_unmaps();
2315         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2316 }
2317
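/*
 * Queue an IOVA for deferred freeing.  Entries are batched per IOMMU and
 * released by flush_unmaps(), either from the unmap timer or once
 * HIGH_WATER_MARK entries have accumulated.
 */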
2318 static void add_unmap(struct dmar_domain *dom, struct iova *iova)
2319 {
2320         unsigned long flags;
2321         int next, iommu_id;
2322         struct intel_iommu *iommu;
2323
2324         spin_lock_irqsave(&async_umap_flush_lock, flags);
2325         if (list_size == HIGH_WATER_MARK)
2326                 flush_unmaps();
2327
2328         iommu = domain_get_iommu(dom);
2329         iommu_id = iommu->seq_id;
2330
2331         next = deferred_flush[iommu_id].next;
2332         deferred_flush[iommu_id].domain[next] = dom;
2333         deferred_flush[iommu_id].iova[next] = iova;
2334         deferred_flush[iommu_id].next++;
2335
2336         if (!timer_on) {
2337                 mod_timer(&unmap_timer, jiffies + msecs_to_jiffies(10));
2338                 timer_on = 1;
2339         }
2340         list_size++;
2341         spin_unlock_irqrestore(&async_umap_flush_lock, flags);
2342 }
2343
2344 void intel_unmap_single(struct device *dev, dma_addr_t dev_addr, size_t size,
2345                         int dir)
2346 {
2347         struct pci_dev *pdev = to_pci_dev(dev);
2348         struct dmar_domain *domain;
2349         unsigned long start_addr;
2350         struct iova *iova;
2351         struct intel_iommu *iommu;
2352
2353         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2354                 return;
2355         domain = find_domain(pdev);
2356         BUG_ON(!domain);
2357
2358         iommu = domain_get_iommu(domain);
2359
2360         iova = find_iova(&domain->iovad, IOVA_PFN(dev_addr));
2361         if (!iova)
2362                 return;
2363
2364         start_addr = iova->pfn_lo << PAGE_SHIFT;
2365         size = aligned_size((u64)dev_addr, size);
2366
2367         pr_debug("Device %s unmapping: %lx@%llx\n",
2368                 pci_name(pdev), size, (unsigned long long)start_addr);
2369
2370         /*  clear the whole page */
2371         dma_pte_clear_range(domain, start_addr, start_addr + size);
2372         /* free page tables */
2373         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2374         if (intel_iommu_strict) {
2375                 if (iommu_flush_iotlb_psi(iommu,
2376                         domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0))
2377                         iommu_flush_write_buffer(iommu);
2378                 /* free iova */
2379                 __free_iova(&domain->iovad, iova);
2380         } else {
2381                 add_unmap(domain, iova);
2382                 /*
2383                  * queue up the release of the unmap to save the roughly 1/6th
2384                  * of the cpu time otherwise used by the iotlb flush operation...
2385                  */
2386         }
2387 }
2388
2389 void *intel_alloc_coherent(struct device *hwdev, size_t size,
2390                            dma_addr_t *dma_handle, gfp_t flags)
2391 {
2392         void *vaddr;
2393         int order;
2394
2395         size = PAGE_ALIGN(size);
2396         order = get_order(size);
2397         flags &= ~(GFP_DMA | GFP_DMA32);
2398
2399         vaddr = (void *)__get_free_pages(flags, order);
2400         if (!vaddr)
2401                 return NULL;
2402         memset(vaddr, 0, size);
2403
2404         *dma_handle = __intel_map_single(hwdev, virt_to_bus(vaddr), size,
2405                                          DMA_BIDIRECTIONAL,
2406                                          hwdev->coherent_dma_mask);
2407         if (*dma_handle)
2408                 return vaddr;
2409         free_pages((unsigned long)vaddr, order);
2410         return NULL;
2411 }
2412
2413 void intel_free_coherent(struct device *hwdev, size_t size, void *vaddr,
2414                          dma_addr_t dma_handle)
2415 {
2416         int order;
2417
2418         size = PAGE_ALIGN(size);
2419         order = get_order(size);
2420
2421         intel_unmap_single(hwdev, dma_handle, size, DMA_BIDIRECTIONAL);
2422         free_pages((unsigned long)vaddr, order);
2423 }
2424
2425 #define SG_ENT_VIRT_ADDRESS(sg) (sg_virt((sg)))
2426
2427 void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist,
2428                     int nelems, int dir)
2429 {
2430         int i;
2431         struct pci_dev *pdev = to_pci_dev(hwdev);
2432         struct dmar_domain *domain;
2433         unsigned long start_addr;
2434         struct iova *iova;
2435         size_t size = 0;
2436         void *addr;
2437         struct scatterlist *sg;
2438         struct intel_iommu *iommu;
2439
2440         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2441                 return;
2442
2443         domain = find_domain(pdev);
2444         BUG_ON(!domain);
2445
2446         iommu = domain_get_iommu(domain);
2447
2448         iova = find_iova(&domain->iovad, IOVA_PFN(sglist[0].dma_address));
2449         if (!iova)
2450                 return;
2451         for_each_sg(sglist, sg, nelems, i) {
2452                 addr = SG_ENT_VIRT_ADDRESS(sg);
2453                 size += aligned_size((u64)addr, sg->length);
2454         }
2455
2456         start_addr = iova->pfn_lo << PAGE_SHIFT;
2457
2458         /*  clear the whole page */
2459         dma_pte_clear_range(domain, start_addr, start_addr + size);
2460         /* free page tables */
2461         dma_pte_free_pagetable(domain, start_addr, start_addr + size);
2462
2463         if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr,
2464                         size >> VTD_PAGE_SHIFT, 0))
2465                 iommu_flush_write_buffer(iommu);
2466
2467         /* free iova */
2468         __free_iova(&domain->iovad, iova);
2469 }
2470
2471 static int intel_nontranslate_map_sg(struct device *hddev,
2472         struct scatterlist *sglist, int nelems, int dir)
2473 {
2474         int i;
2475         struct scatterlist *sg;
2476
2477         for_each_sg(sglist, sg, nelems, i) {
2478                 BUG_ON(!sg_page(sg));
2479                 sg->dma_address = virt_to_bus(SG_ENT_VIRT_ADDRESS(sg));
2480                 sg->dma_length = sg->length;
2481         }
2482         return nelems;
2483 }
2484
2485 int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int nelems,
2486                  int dir)
2487 {
2488         void *addr;
2489         int i;
2490         struct pci_dev *pdev = to_pci_dev(hwdev);
2491         struct dmar_domain *domain;
2492         size_t size = 0;
2493         int prot = 0;
2494         size_t offset = 0;
2495         struct iova *iova = NULL;
2496         int ret;
2497         struct scatterlist *sg;
2498         unsigned long start_addr;
2499         struct intel_iommu *iommu;
2500
2501         BUG_ON(dir == DMA_NONE);
2502         if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO)
2503                 return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir);
2504
2505         domain = get_valid_domain_for_dev(pdev);
2506         if (!domain)
2507                 return 0;
2508
2509         iommu = domain_get_iommu(domain);
2510
2511         for_each_sg(sglist, sg, nelems, i) {
2512                 addr = SG_ENT_VIRT_ADDRESS(sg);
2513                 addr = (void *)virt_to_phys(addr);
2514                 size += aligned_size((u64)addr, sg->length);
2515         }
2516
2517         iova = __intel_alloc_iova(hwdev, domain, size, pdev->dma_mask);
2518         if (!iova) {
2519                 sglist->dma_length = 0;
2520                 return 0;
2521         }
2522
2523         /*
2524          * Check if DMAR supports zero-length reads on write only
2525          * mappings..
2526          */
2527         if (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL || \
2528                         !cap_zlr(iommu->cap))
2529                 prot |= DMA_PTE_READ;
2530         if (dir == DMA_FROM_DEVICE || dir == DMA_BIDIRECTIONAL)
2531                 prot |= DMA_PTE_WRITE;
2532
2533         start_addr = iova->pfn_lo << PAGE_SHIFT;
2534         offset = 0;
2535         for_each_sg(sglist, sg, nelems, i) {
2536                 addr = SG_ENT_VIRT_ADDRESS(sg);
2537                 addr = (void *)virt_to_phys(addr);
2538                 size = aligned_size((u64)addr, sg->length);
2539                 ret = domain_page_mapping(domain, start_addr + offset,
2540                         ((u64)addr) & PAGE_MASK,
2541                         size, prot);
2542                 if (ret) {
2543                         /*  clear the page */
2544                         dma_pte_clear_range(domain, start_addr,
2545                                   start_addr + offset);
2546                         /* free page tables */
2547                         dma_pte_free_pagetable(domain, start_addr,
2548                                   start_addr + offset);
2549                         /* free iova */
2550                         __free_iova(&domain->iovad, iova);
2551                         return 0;
2552                 }
2553                 sg->dma_address = start_addr + offset +
2554                                 ((u64)addr & (~PAGE_MASK));
2555                 sg->dma_length = sg->length;
2556                 offset += size;
2557         }
2558
2559         /* it's a non-present to present mapping */
2560         if (iommu_flush_iotlb_psi(iommu, domain->id,
2561                         start_addr, offset >> VTD_PAGE_SHIFT, 1))
2562                 iommu_flush_write_buffer(iommu);
2563         return nelems;
2564 }
2565
2566 static struct dma_mapping_ops intel_dma_ops = {
2567         .alloc_coherent = intel_alloc_coherent,
2568         .free_coherent = intel_free_coherent,
2569         .map_single = intel_map_single,
2570         .unmap_single = intel_unmap_single,
2571         .map_sg = intel_map_sg,
2572         .unmap_sg = intel_unmap_sg,
2573 };
2574
2575 static inline int iommu_domain_cache_init(void)
2576 {
2577         int ret = 0;
2578
2579         iommu_domain_cache = kmem_cache_create("iommu_domain",
2580                                          sizeof(struct dmar_domain),
2581                                          0,
2582                                          SLAB_HWCACHE_ALIGN,
2584                                          NULL);
2585         if (!iommu_domain_cache) {
2586                 printk(KERN_ERR "Couldn't create iommu_domain cache\n");
2587                 ret = -ENOMEM;
2588         }
2589
2590         return ret;
2591 }
2592
2593 static inline int iommu_devinfo_cache_init(void)
2594 {
2595         int ret = 0;
2596
2597         iommu_devinfo_cache = kmem_cache_create("iommu_devinfo",
2598                                          sizeof(struct device_domain_info),
2599                                          0,
2600                                          SLAB_HWCACHE_ALIGN,
2601                                          NULL);
2602         if (!iommu_devinfo_cache) {
2603                 printk(KERN_ERR "Couldn't create devinfo cache\n");
2604                 ret = -ENOMEM;
2605         }
2606
2607         return ret;
2608 }
2609
2610 static inline int iommu_iova_cache_init(void)
2611 {
2612         int ret = 0;
2613
2614         iommu_iova_cache = kmem_cache_create("iommu_iova",
2615                                          sizeof(struct iova),
2616                                          0,
2617                                          SLAB_HWCACHE_ALIGN,
2618                                          NULL);
2619         if (!iommu_iova_cache) {
2620                 printk(KERN_ERR "Couldn't create iova cache\n");
2621                 ret = -ENOMEM;
2622         }
2623
2624         return ret;
2625 }
2626
2627 static int __init iommu_init_mempool(void)
2628 {
2629         int ret;
2630         ret = iommu_iova_cache_init();
2631         if (ret)
2632                 return ret;
2633
2634         ret = iommu_domain_cache_init();
2635         if (ret)
2636                 goto domain_error;
2637
2638         ret = iommu_devinfo_cache_init();
2639         if (!ret)
2640                 return ret;
2641
2642         kmem_cache_destroy(iommu_domain_cache);
2643 domain_error:
2644         kmem_cache_destroy(iommu_iova_cache);
2645
2646         return -ENOMEM;
2647 }
2648
2649 static void __init iommu_exit_mempool(void)
2650 {
2651         kmem_cache_destroy(iommu_devinfo_cache);
2652         kmem_cache_destroy(iommu_domain_cache);
2653         kmem_cache_destroy(iommu_iova_cache);
2654
2655 }
2656
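/*
 * Mark DMAR units as ignored when they cover no PCI devices at all, or,
 * if gfx mapping is disabled, only graphics devices; devices under an
 * ignored unit are flagged for pass-through.
 */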
2657 static void __init init_no_remapping_devices(void)
2658 {
2659         struct dmar_drhd_unit *drhd;
2660
2661         for_each_drhd_unit(drhd) {
2662                 if (!drhd->include_all) {
2663                         int i;
2664                         for (i = 0; i < drhd->devices_cnt; i++)
2665                                 if (drhd->devices[i] != NULL)
2666                                         break;
2667                         /* ignore DMAR unit if no pci devices exist */
2668                         if (i == drhd->devices_cnt)
2669                                 drhd->ignored = 1;
2670                 }
2671         }
2672
2673         if (dmar_map_gfx)
2674                 return;
2675
2676         for_each_drhd_unit(drhd) {
2677                 int i;
2678                 if (drhd->ignored || drhd->include_all)
2679                         continue;
2680
2681                 for (i = 0; i < drhd->devices_cnt; i++)
2682                         if (drhd->devices[i] &&
2683                                 !IS_GFX_DEVICE(drhd->devices[i]))
2684                                 break;
2685
2686                 if (i < drhd->devices_cnt)
2687                         continue;
2688
2689                 /* bypass IOMMU if it is just for gfx devices */
2690                 drhd->ignored = 1;
2691                 for (i = 0; i < drhd->devices_cnt; i++) {
2692                         if (!drhd->devices[i])
2693                                 continue;
2694                         drhd->devices[i]->dev.archdata.iommu = DUMMY_DEVICE_DOMAIN_INFO;
2695                 }
2696         }
2697 }
2698
2699 int __init intel_iommu_init(void)
2700 {
2701         int ret = 0;
2702
2703         if (dmar_table_init())
2704                 return  -ENODEV;
2705
2706         if (dmar_dev_scope_init())
2707                 return  -ENODEV;
2708
2709         /*
2710          * Check the need for DMA-remapping initialization now.
2711          * Above initialization will also be used by Interrupt-remapping.
2712          */
2713         if (no_iommu || swiotlb || dmar_disabled)
2714                 return -ENODEV;
2715
2716         iommu_init_mempool();
2717         dmar_init_reserved_ranges();
2718
2719         init_no_remapping_devices();
2720
2721         ret = init_dmars();
2722         if (ret) {
2723                 printk(KERN_ERR "IOMMU: dmar init failed\n");
2724                 put_iova_domain(&reserved_iova_list);
2725                 iommu_exit_mempool();
2726                 return ret;
2727         }
2728         printk(KERN_INFO
2729         "PCI-DMA: Intel(R) Virtualization Technology for Directed I/O\n");
2730
2731         init_timer(&unmap_timer);
2732         force_iommu = 1;
2733         dma_ops = &intel_dma_ops;
2734
2735         register_iommu(&intel_iommu_ops);
2736
2737         return 0;
2738 }
2739
2740 static int vm_domain_add_dev_info(struct dmar_domain *domain,
2741                                   struct pci_dev *pdev)
2742 {
2743         struct device_domain_info *info;
2744         unsigned long flags;
2745
2746         info = alloc_devinfo_mem();
2747         if (!info)
2748                 return -ENOMEM;
2749
2750         info->bus = pdev->bus->number;
2751         info->devfn = pdev->devfn;
2752         info->dev = pdev;
2753         info->domain = domain;
2754
2755         spin_lock_irqsave(&device_domain_lock, flags);
2756         list_add(&info->link, &domain->devices);
2757         list_add(&info->global, &device_domain_list);
2758         pdev->dev.archdata.iommu = info;
2759         spin_unlock_irqrestore(&device_domain_lock, flags);
2760
2761         return 0;
2762 }
2763
2764 static void vm_domain_remove_one_dev_info(struct dmar_domain *domain,
2765                                           struct pci_dev *pdev)
2766 {
2767         struct device_domain_info *info;
2768         struct intel_iommu *iommu;
2769         unsigned long flags;
2770         int found = 0;
2771         struct list_head *entry, *tmp;
2772
2773         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
2774         if (!iommu)
2775                 return;
2776
2777         spin_lock_irqsave(&device_domain_lock, flags);
2778         list_for_each_safe(entry, tmp, &domain->devices) {
2779                 info = list_entry(entry, struct device_domain_info, link);
2780                 if (info->bus == pdev->bus->number &&
2781                     info->devfn == pdev->devfn) {
2782                         list_del(&info->link);
2783                         list_del(&info->global);
2784                         if (info->dev)
2785                                 info->dev->dev.archdata.iommu = NULL;
2786                         spin_unlock_irqrestore(&device_domain_lock, flags);
2787
2788                         iommu_detach_dev(iommu, info->bus, info->devfn);
2789                         free_devinfo_mem(info);
2790
2791                         spin_lock_irqsave(&device_domain_lock, flags);
2792
2793                         if (found)
2794                                 break;
2795                         else
2796                                 continue;
2797                 }
2798
2799                 /* if there are no other devices under the same iommu
2800                  * owned by this domain, clear this iommu in iommu_bmp,
2801                  * update iommu count and coherency
2802                  */
2803                 if (device_to_iommu(info->bus, info->devfn) == iommu)
2804                         found = 1;
2805         }
2806
2807         if (found == 0) {
2808                 unsigned long tmp_flags;
2809                 spin_lock_irqsave(&domain->iommu_lock, tmp_flags);
2810                 clear_bit(iommu->seq_id, &domain->iommu_bmp);
2811                 domain->iommu_count--;
2812                 domain_update_iommu_coherency(domain);
2813                 spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags);
2814         }
2815
2816         spin_unlock_irqrestore(&device_domain_lock, flags);
2817 }
2818
2819 static void vm_domain_remove_all_dev_info(struct dmar_domain *domain)
2820 {
2821         struct device_domain_info *info;
2822         struct intel_iommu *iommu;
2823         unsigned long flags1, flags2;
2824
2825         spin_lock_irqsave(&device_domain_lock, flags1);
2826         while (!list_empty(&domain->devices)) {
2827                 info = list_entry(domain->devices.next,
2828                         struct device_domain_info, link);
2829                 list_del(&info->link);
2830                 list_del(&info->global);
2831                 if (info->dev)
2832                         info->dev->dev.archdata.iommu = NULL;
2833
2834                 spin_unlock_irqrestore(&device_domain_lock, flags1);
2835
2836                 iommu = device_to_iommu(info->bus, info->devfn);
2837                 iommu_detach_dev(iommu, info->bus, info->devfn);
2838
2839                 /* clear this iommu in iommu_bmp, update iommu count
2840                  * and coherency
2841                  */
2842                 spin_lock_irqsave(&domain->iommu_lock, flags2);
2843                 if (test_and_clear_bit(iommu->seq_id,
2844                                        &domain->iommu_bmp)) {
2845                         domain->iommu_count--;
2846                         domain_update_iommu_coherency(domain);
2847                 }
2848                 spin_unlock_irqrestore(&domain->iommu_lock, flags2);
2849
2850                 free_devinfo_mem(info);
2851                 spin_lock_irqsave(&device_domain_lock, flags1);
2852         }
2853         spin_unlock_irqrestore(&device_domain_lock, flags1);
2854 }
2855
2856 /* domain id for virtual machine domains; this id is not written into context entries */
2857 static unsigned long vm_domid;
2858
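/*
 * Smallest agaw among the domain's own agaw and those of every IOMMU the
 * domain is currently attached to.
 */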
2859 static int vm_domain_min_agaw(struct dmar_domain *domain)
2860 {
2861         int i;
2862         int min_agaw = domain->agaw;
2863
2864         i = find_first_bit(&domain->iommu_bmp, g_num_of_iommus);
2865         for (; i < g_num_of_iommus; ) {
2866                 if (min_agaw > g_iommus[i]->agaw)
2867                         min_agaw = g_iommus[i]->agaw;
2868
2869                 i = find_next_bit(&domain->iommu_bmp, g_num_of_iommus, i+1);
2870         }
2871
2872         return min_agaw;
2873 }
2874
2875 static struct dmar_domain *iommu_alloc_vm_domain(void)
2876 {
2877         struct dmar_domain *domain;
2878
2879         domain = alloc_domain_mem();
2880         if (!domain)
2881                 return NULL;
2882
2883         domain->id = vm_domid++;
2884         memset(&domain->iommu_bmp, 0, sizeof(unsigned long));
2885         domain->flags = DOMAIN_FLAG_VIRTUAL_MACHINE;
2886
2887         return domain;
2888 }
2889
2890 static int vm_domain_init(struct dmar_domain *domain, int guest_width)
2891 {
2892         int adjust_width;
2893
2894         init_iova_domain(&domain->iovad, DMA_32BIT_PFN);
2895         spin_lock_init(&domain->mapping_lock);
2896         spin_lock_init(&domain->iommu_lock);
2897
2898         domain_reserve_special_ranges(domain);
2899
2900         /* calculate AGAW */
2901         domain->gaw = guest_width;
2902         adjust_width = guestwidth_to_adjustwidth(guest_width);
2903         domain->agaw = width_to_agaw(adjust_width);
2904
2905         INIT_LIST_HEAD(&domain->devices);
2906
2907         domain->iommu_count = 0;
2908         domain->iommu_coherency = 0;
2909         domain->max_addr = 0;
2910
2911         /* always allocate the top pgd */
2912         domain->pgd = (struct dma_pte *)alloc_pgtable_page();
2913         if (!domain->pgd)
2914                 return -ENOMEM;
2915         domain_flush_cache(domain, domain->pgd, PAGE_SIZE);
2916         return 0;
2917 }
2918
2919 static void iommu_free_vm_domain(struct dmar_domain *domain)
2920 {
2921         unsigned long flags;
2922         struct dmar_drhd_unit *drhd;
2923         struct intel_iommu *iommu;
2924         unsigned long i;
2925         unsigned long ndomains;
2926
2927         for_each_drhd_unit(drhd) {
2928                 if (drhd->ignored)
2929                         continue;
2930                 iommu = drhd->iommu;
2931
2932                 ndomains = cap_ndoms(iommu->cap);
2933                 i = find_first_bit(iommu->domain_ids, ndomains);
2934                 for (; i < ndomains; ) {
2935                         if (iommu->domains[i] == domain) {
2936                                 spin_lock_irqsave(&iommu->lock, flags);
2937                                 clear_bit(i, iommu->domain_ids);
2938                                 iommu->domains[i] = NULL;
2939                                 spin_unlock_irqrestore(&iommu->lock, flags);
2940                                 break;
2941                         }
2942                         i = find_next_bit(iommu->domain_ids, ndomains, i+1);
2943                 }
2944         }
2945 }
2946
2947 static void vm_domain_exit(struct dmar_domain *domain)
2948 {
2949         u64 end;
2950
2951         /* Domain 0 is reserved, so don't process it */
2952         if (!domain)
2953                 return;
2954
2955         vm_domain_remove_all_dev_info(domain);
2956         /* destroy iovas */
2957         put_iova_domain(&domain->iovad);
2958         end = DOMAIN_MAX_ADDR(domain->gaw);
2959         end = end & (~VTD_PAGE_MASK);
2960
2961         /* clear ptes */
2962         dma_pte_clear_range(domain, 0, end);
2963
2964         /* free page tables */
2965         dma_pte_free_pagetable(domain, 0, end);
2966
2967         iommu_free_vm_domain(domain);
2968         free_domain_mem(domain);
2969 }
2970
2971 static int intel_iommu_domain_init(struct iommu_domain *domain)
2972 {
2973         struct dmar_domain *dmar_domain;
2974
2975         dmar_domain = iommu_alloc_vm_domain();
2976         if (!dmar_domain) {
2977                 printk(KERN_ERR
2978                         "intel_iommu_domain_init: dmar_domain == NULL\n");
2979                 return -ENOMEM;
2980         }
2981         if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
2982                 printk(KERN_ERR
2983                         "intel_iommu_domain_init() failed\n");
2984                 vm_domain_exit(dmar_domain);
2985                 return -ENOMEM;
2986         }
2987         domain->priv = dmar_domain;
2988
2989         return 0;
2990 }
2991
2992 static void intel_iommu_domain_destroy(struct iommu_domain *domain)
2993 {
2994         struct dmar_domain *dmar_domain = domain->priv;
2995
2996         domain->priv = NULL;
2997         vm_domain_exit(dmar_domain);
2998 }
2999
3000 static int intel_iommu_attach_device(struct iommu_domain *domain,
3001                                      struct device *dev)
3002 {
3003         struct dmar_domain *dmar_domain = domain->priv;
3004         struct pci_dev *pdev = to_pci_dev(dev);
3005         struct intel_iommu *iommu;
3006         int addr_width;
3007         u64 end;
3008         int ret;
3009
3010         /* normally pdev is not mapped */
3011         if (unlikely(domain_context_mapped(pdev))) {
3012                 struct dmar_domain *old_domain;
3013
3014                 old_domain = find_domain(pdev);
3015                 if (old_domain) {
3016                         if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE)
3017                                 vm_domain_remove_one_dev_info(old_domain, pdev);
3018                         else
3019                                 domain_remove_dev_info(old_domain);
3020                 }
3021         }
3022
3023         iommu = device_to_iommu(pdev->bus->number, pdev->devfn);
3024         if (!iommu)
3025                 return -ENODEV;
3026
3027         /* check if this iommu agaw is sufficient for max mapped address */
3028         addr_width = agaw_to_width(iommu->agaw);
3029         end = DOMAIN_MAX_ADDR(addr_width);
3030         end = end & VTD_PAGE_MASK;
3031         if (end < dmar_domain->max_addr) {
3032                 printk(KERN_ERR "%s: iommu agaw (%d) is not "
3033                        "sufficient for the mapped address (%llx)\n",
3034                        __func__, iommu->agaw, dmar_domain->max_addr);
3035                 return -EFAULT;
3036         }
3037
3038         ret = domain_context_mapping(dmar_domain, pdev);
3039         if (ret)
3040                 return ret;
3041
3042         ret = vm_domain_add_dev_info(dmar_domain, pdev);
3043         return ret;
3044 }
3045
3046 static void intel_iommu_detach_device(struct iommu_domain *domain,
3047                                       struct device *dev)
3048 {
3049         struct dmar_domain *dmar_domain = domain->priv;
3050         struct pci_dev *pdev = to_pci_dev(dev);
3051
3052         vm_domain_remove_one_dev_info(dmar_domain, pdev);
3053 }
3054
3055 static int intel_iommu_map_range(struct iommu_domain *domain,
3056                                  unsigned long iova, phys_addr_t hpa,
3057                                  size_t size, int iommu_prot)
3058 {
3059         struct dmar_domain *dmar_domain = domain->priv;
3060         u64 max_addr;
3061         int addr_width;
3062         int prot = 0;
3063         int ret;
3064
3065         if (iommu_prot & IOMMU_READ)
3066                 prot |= DMA_PTE_READ;
3067         if (iommu_prot & IOMMU_WRITE)
3068                 prot |= DMA_PTE_WRITE;
3069
3070         max_addr = (iova & VTD_PAGE_MASK) + VTD_PAGE_ALIGN(size);
3071         if (dmar_domain->max_addr < max_addr) {
3072                 int min_agaw;
3073                 u64 end;
3074
3075                 /* check if minimum agaw is sufficient for mapped address */
3076                 min_agaw = vm_domain_min_agaw(dmar_domain);
3077                 addr_width = agaw_to_width(min_agaw);
3078                 end = DOMAIN_MAX_ADDR(addr_width);
3079                 end = end & VTD_PAGE_MASK;
3080                 if (end < max_addr) {
3081                         printk(KERN_ERR "%s: iommu agaw (%d) is not "
3082                                "sufficient for the mapped address (%llx)\n",
3083                                __func__, min_agaw, max_addr);
3084                         return -EFAULT;
3085                 }
3086                 dmar_domain->max_addr = max_addr;
3087         }
3088
3089         ret = domain_page_mapping(dmar_domain, iova, hpa, size, prot);
3090         return ret;
3091 }
3092
3093 static void intel_iommu_unmap_range(struct iommu_domain *domain,
3094                                     unsigned long iova, size_t size)
3095 {
3096         struct dmar_domain *dmar_domain = domain->priv;
3097         dma_addr_t base;
3098
3099         /* The address might not be aligned */
3100         base = iova & VTD_PAGE_MASK;
3101         size = VTD_PAGE_ALIGN(size);
3102         dma_pte_clear_range(dmar_domain, base, base + size);
3103
3104         if (dmar_domain->max_addr == base + size)
3105                 dmar_domain->max_addr = base;
3106 }
3107
3108 static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain,
3109                                             unsigned long iova)
3110 {
3111         struct dmar_domain *dmar_domain = domain->priv;
3112         struct dma_pte *pte;
3113         u64 phys = 0;
3114
3115         pte = addr_to_dma_pte(dmar_domain, iova);
3116         if (pte)
3117                 phys = dma_pte_addr(pte);
3118
3119         return phys;
3120 }
3121
3122 static struct iommu_ops intel_iommu_ops = {
3123         .domain_init    = intel_iommu_domain_init,
3124         .domain_destroy = intel_iommu_domain_destroy,
3125         .attach_dev     = intel_iommu_attach_device,
3126         .detach_dev     = intel_iommu_detach_device,
3127         .map            = intel_iommu_map_range,
3128         .unmap          = intel_iommu_unmap_range,
3129         .iova_to_phys   = intel_iommu_iova_to_phys,
3130 };