/*
 * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation
 *
 * Rewrite, cleanup, new allocation schemes, virtual merging:
 * Copyright (C) 2004 Olof Johansson, IBM Corporation
 *               and  Ben. Herrenschmidt, IBM Corporation
 *
 * Dynamic DMA mapping support, bus-independent parts.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307 USA
 */
#include <linux/config.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/spinlock.h>
#include <linux/string.h>
#include <linux/dma-mapping.h>
#include <linux/bitops.h>
#include <asm/io.h>
#include <asm/prom.h>
#include <asm/iommu.h>
#include <asm/pci-bridge.h>
#include <asm/machdep.h>
#include <asm/kdump.h>

/* Debug tracing is compiled out by default */
#define DBG(...)
#ifdef CONFIG_IOMMU_VMERGE
static int novmerge = 0;
#else
static int novmerge = 1;
#endif
static int __init setup_iommu(char *str)
{
	if (!strcmp(str, "novmerge"))
		novmerge = 1;
	else if (!strcmp(str, "vmerge"))
		novmerge = 0;
	return 1;
}

__setup("iommu=", setup_iommu);
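/*
 * Boot-time control: passing "iommu=novmerge" on the kernel command line
 * forces virtual merging off, and "iommu=vmerge" forces it on, overriding
 * the CONFIG_IOMMU_VMERGE default above.
 */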
static unsigned long iommu_range_alloc(struct iommu_table *tbl,
				       unsigned long npages,
				       unsigned long *handle,
				       unsigned long mask,
				       unsigned int align_order)
{
	unsigned long n, end, i, start;
	unsigned long limit;
	int largealloc = npages > 15;
	int pass = 0;
	unsigned long align_mask;

	/* Low bits that must be clear in an aligned allocation; this form
	 * is well defined even for align_order == 0.
	 */
	align_mask = (1UL << align_order) - 1;
	/* This allocator was derived from x86_64's bit string search */

	/* Sanity check */
	if (unlikely(npages == 0)) {
		if (printk_ratelimit())
			WARN_ON(1);
		return DMA_ERROR_CODE;
	}

	if (handle && *handle)
		start = *handle;
	else
		start = largealloc ? tbl->it_largehint : tbl->it_hint;

	/* Use only half of the table for small allocs (15 pages or less) */
	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;

	if (largealloc && start < tbl->it_halfpoint)
		start = tbl->it_halfpoint;

	/* The case below can happen if we have a small segment appended
	 * to a large one, or when the previous alloc was at the very end
	 * of the available space. If so, go back to the initial start.
	 */
	if (start >= limit)
		start = largealloc ? tbl->it_largehint : tbl->it_hint;

 again:
	if (limit + tbl->it_offset > mask) {
		limit = mask - tbl->it_offset + 1;
		/* If we're constrained on address range, first try
		 * at the masked hint to avoid O(n) search complexity,
		 * but on second pass, start at 0.
		 */
		if ((start & mask) >= limit || pass > 0)
			start = 0;
		else
			start &= mask;
	}
	n = find_next_zero_bit(tbl->it_map, limit, start);

	/* Align allocation */
	n = (n + align_mask) & ~align_mask;

	end = n + npages;

	if (unlikely(end >= limit)) {
		if (likely(pass < 2)) {
			/* First failure, just rescan this half of the table.
			 * Second failure, rescan the other half of the table.
			 */
			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
			limit = pass ? tbl->it_size : limit;
			pass++;
			goto again;
		} else {
			/* Third failure, give up */
			return DMA_ERROR_CODE;
		}
	}
	for (i = n; i < end; i++)
		if (test_bit(i, tbl->it_map)) {
			start = i + 1;
			goto again;
		}

	for (i = n; i < end; i++)
		__set_bit(i, tbl->it_map);
	/* Bump the hint to a new block for small allocs. */
	if (largealloc) {
		/* Don't bump to a new block, to avoid fragmentation */
		tbl->it_largehint = end;
	} else {
		/* Overflow will be taken care of at the next allocation */
		tbl->it_hint = (end + tbl->it_blocksize - 1) &
			       ~(tbl->it_blocksize - 1);
	}
	/* Update handle for SG allocations */
	if (handle)
		*handle = end;

	return n;
}
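/*
 * Worked example of the hint bump above (numbers invented): with
 * it_blocksize = 16, a small allocation ending at entry 21 rounds
 * it_hint up to (21 + 15) & ~15 = 32, so the next small allocation
 * starts on a fresh block rather than packing in behind this one.
 */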
static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page,
		unsigned int npages, enum dma_data_direction direction,
		unsigned long mask, unsigned int align_order)
{
	unsigned long entry, flags;
	dma_addr_t ret = DMA_ERROR_CODE;
	spin_lock_irqsave(&(tbl->it_lock), flags);

	entry = iommu_range_alloc(tbl, npages, NULL, mask, align_order);

	if (unlikely(entry == DMA_ERROR_CODE)) {
		spin_unlock_irqrestore(&(tbl->it_lock), flags);
		return DMA_ERROR_CODE;
	}

	entry += tbl->it_offset;	/* Offset into real TCE table */
	ret = entry << PAGE_SHIFT;	/* Set the return dma address */

	/* Put the TCEs in the HW table */
	ppc_md.tce_build(tbl, entry, npages, (unsigned long)page & PAGE_MASK,
			 direction);
	/* Flush/invalidate TLB caches if necessary */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);

	/* Make sure updates are seen by hardware */
	mb();

	return ret;
}
static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
			 unsigned int npages)
{
	unsigned long entry, free_entry;
	unsigned long i;

	entry = dma_addr >> PAGE_SHIFT;
	free_entry = entry - tbl->it_offset;
	if (((free_entry + npages) > tbl->it_size) ||
	    (entry < tbl->it_offset)) {
		if (printk_ratelimit()) {
			printk(KERN_INFO "iommu_free: invalid entry\n");
			printk(KERN_INFO "\tentry    = 0x%lx\n", entry);
			printk(KERN_INFO "\tdma_addr = 0x%lx\n", (u64)dma_addr);
			printk(KERN_INFO "\tTable    = 0x%lx\n", (u64)tbl);
			printk(KERN_INFO "\tbus#     = 0x%lx\n", (u64)tbl->it_busno);
			printk(KERN_INFO "\tsize     = 0x%lx\n", (u64)tbl->it_size);
			printk(KERN_INFO "\tstartOff = 0x%lx\n", (u64)tbl->it_offset);
			printk(KERN_INFO "\tindex    = 0x%lx\n", (u64)tbl->it_index);
			WARN_ON(1);
		}
		return;
	}
	ppc_md.tce_free(tbl, entry, npages);

	for (i = 0; i < npages; i++)
		__clear_bit(free_entry + i, tbl->it_map);
}
static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
		       unsigned int npages)
{
	unsigned long flags;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	__iommu_free(tbl, dma_addr, npages);
	/* Make sure the TLB cache is flushed if the HW needs it. We do
	 * not do an mb() here on purpose, it is not needed on any of
	 * the current platforms.
	 */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
		 struct scatterlist *sglist, int nelems,
		 unsigned long mask, enum dma_data_direction direction)
{
	dma_addr_t dma_next = 0, dma_addr;
	unsigned long flags;
	struct scatterlist *s, *outs, *segstart;
	int outcount, incount;
	unsigned long handle;
	BUG_ON(direction == DMA_NONE);

	if ((nelems == 0) || !tbl)
		return 0;

	outs = s = segstart = &sglist[0];
	outcount = 1;
	incount = nelems;
	handle = 0;

	/* Init first segment length for backout at failure */
	outs->dma_length = 0;

	DBG("mapping %d elements:\n", nelems);

	spin_lock_irqsave(&(tbl->it_lock), flags);
	for (s = outs; nelems; nelems--, s++) {
		unsigned long vaddr, npages, entry, slen;

		slen = s->length;
		/* Sanity check */
		if (slen == 0) {
			dma_next = 0;
			continue;
		}
		/* Allocate iommu entries for that segment */
		vaddr = (unsigned long)page_address(s->page) + s->offset;
		npages = PAGE_ALIGN(vaddr + slen) - (vaddr & PAGE_MASK);
		npages >>= PAGE_SHIFT;
		entry = iommu_range_alloc(tbl, npages, &handle, mask >> PAGE_SHIFT, 0);

		DBG("  - vaddr: %lx, size: %lx\n", vaddr, slen);
		/* Handle failure */
		if (unlikely(entry == DMA_ERROR_CODE)) {
			if (printk_ratelimit())
				printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %lx"
				       " npages %lx\n", tbl, vaddr, npages);
			goto failure;
		}
		/* Convert entry to a dma_addr_t */
		entry += tbl->it_offset;
		dma_addr = entry << PAGE_SHIFT;
		dma_addr |= s->offset;

		DBG("  - %lx pages, entry: %lx, dma_addr: %lx\n",
		    npages, entry, dma_addr);

		/* Insert into HW table */
		ppc_md.tce_build(tbl, entry, npages, vaddr & PAGE_MASK, direction);
		/* If we are in an open segment, try merging */
		if (segstart != s) {
			DBG("  - trying merge...\n");
			/* We cannot merge if:
			 * - the allocated dma_addr isn't contiguous to the
			 *   previous allocation
			 */
			if (novmerge || (dma_addr != dma_next)) {
				/* Can't merge: create a new segment */
				segstart = s;
				outcount++; outs++;
				DBG("    can't merge, new segment.\n");
			} else {
				outs->dma_length += s->length;
				DBG("    merged, new len: %lx\n", outs->dma_length);
			}
		}
		if (segstart == s) {
			/* This is a new segment, fill entries */
			DBG("  - filling new segment.\n");
			outs->dma_address = dma_addr;
			outs->dma_length = slen;
		}
		/* Calculate next page pointer for contiguous check */
		dma_next = dma_addr + slen;

		DBG("  - dma next is: %lx\n", dma_next);
	}
	/* Flush/invalidate TLB caches if necessary */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);

	DBG("mapped %d elements:\n", outcount);
	/* For the sake of iommu_unmap_sg, we clear out the length in the
	 * next entry of the sglist if we didn't fill the list completely.
	 */
	if (outcount < incount) {
		outs++;
		outs->dma_address = DMA_ERROR_CODE;
		outs->dma_length = 0;
	}

	/* Make sure updates are seen by hardware */
	mb();

	return outcount;

 failure:
	for (s = &sglist[0]; s <= outs; s++) {
		if (s->dma_length != 0) {
			unsigned long vaddr, npages;

			vaddr = s->dma_address & PAGE_MASK;
			npages = (PAGE_ALIGN(s->dma_address + s->dma_length)
				  - vaddr) >> PAGE_SHIFT;
			__iommu_free(tbl, vaddr, npages);
			s->dma_address = DMA_ERROR_CODE;
			s->dma_length = 0;
		}
	}
	spin_unlock_irqrestore(&(tbl->it_lock), flags);
	return 0;
}
void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
		    int nelems, enum dma_data_direction direction)
{
	unsigned long flags;

	BUG_ON(direction == DMA_NONE);

	if (!tbl)
		return;

	spin_lock_irqsave(&(tbl->it_lock), flags);

	while (nelems--) {
		unsigned int npages;
		dma_addr_t dma_handle = sglist->dma_address;
		if (sglist->dma_length == 0)
			break;
		npages = (PAGE_ALIGN(dma_handle + sglist->dma_length)
			  - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT;
		__iommu_free(tbl, dma_handle, npages);
		sglist++;
	}
	/* Flush/invalidate TLBs if necessary. As for iommu_free(), we
	 * do not do an mb() here; the affected platforms do not need it
	 * when freeing.
	 */
	if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);

	spin_unlock_irqrestore(&(tbl->it_lock), flags);
}
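/*
 * Illustrative caller sketch (hypothetical; use_segment() and the variables
 * are invented for the example): iommu_map_sg() returns the number of DMA
 * segments actually produced, which with virtual merging can be fewer than
 * nelems, and the whole list is later released with iommu_unmap_sg():
 *
 *	int i, n = iommu_map_sg(dev, tbl, sg, nelems, mask, DMA_TO_DEVICE);
 *	for (i = 0; i < n; i++)
 *		use_segment(sg[i].dma_address, sg[i].dma_length);
 *	iommu_unmap_sg(tbl, sg, nelems, DMA_TO_DEVICE);
 */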
/*
 * Build an iommu_table structure.  This contains a bit map which
 * is used to manage allocation of the tce space.
 */
struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
{
	unsigned long sz;
	static int welcomed = 0;
	struct page *page;
	/* Set aside 1/4 of the table for large allocations. */
	tbl->it_halfpoint = tbl->it_size * 3 / 4;

	/* number of bytes needed for the bitmap */
	sz = (tbl->it_size + 7) >> 3;

	page = alloc_pages_node(nid, GFP_ATOMIC, get_order(sz));
	if (!page)
		panic("iommu_init_table: Can't allocate %ld bytes\n", sz);
	tbl->it_map = page_address(page);
	memset(tbl->it_map, 0, sz);
	tbl->it_hint = 0;
	tbl->it_largehint = tbl->it_halfpoint;
	spin_lock_init(&tbl->it_lock);
#ifdef CONFIG_CRASH_DUMP
	if (ppc_md.tce_get) {
		unsigned long index, tceval;
		unsigned long tcecount = 0;

		/*
		 * Reserve the existing mappings left by the first kernel.
		 */
		for (index = 0; index < tbl->it_size; index++) {
			tceval = ppc_md.tce_get(tbl, index + tbl->it_offset);
			/*
			 * A freed TCE entry contains 0x7fffffffffffffff on JS20.
			 */
			if (tceval && (tceval != 0x7fffffffffffffffUL)) {
				__set_bit(index, tbl->it_map);
				tcecount++;
			}
		}
		if ((tbl->it_size - tcecount) < KDUMP_MIN_TCE_ENTRIES) {
			printk(KERN_WARNING "TCE table is full; ");
			printk(KERN_WARNING "freeing %d entries for the kdump boot\n",
			       KDUMP_MIN_TCE_ENTRIES);
			for (index = tbl->it_size - KDUMP_MIN_TCE_ENTRIES;
			     index < tbl->it_size; index++)
				__clear_bit(index, tbl->it_map);
		}
	}
#else
	/* Clear the hardware table in case firmware left allocations in it */
	ppc_md.tce_free(tbl, tbl->it_offset, tbl->it_size);
#endif
	if (!welcomed) {
		printk(KERN_INFO "IOMMU table initialized, virtual merging %s\n",
		       novmerge ? "disabled" : "enabled");
		welcomed = 1;
	}

	return tbl;
}
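/*
 * Illustrative sketch (hypothetical platform code; the values are invented):
 * the caller fills in the table geometry, which this file only reads, before
 * handing the structure to iommu_init_table():
 *
 *	tbl->it_busno     = busno;
 *	tbl->it_offset    = offset >> PAGE_SHIFT;
 *	tbl->it_size      = size >> PAGE_SHIFT;
 *	tbl->it_blocksize = 16;
 *	iommu_init_table(tbl, nid);
 */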
void iommu_free_table(struct device_node *dn)
{
	struct pci_dn *pdn = dn->data;
	struct iommu_table *tbl = pdn->iommu_table;
	unsigned long bitmap_sz, i;
	unsigned int order;
	if (!tbl || !tbl->it_map) {
		printk(KERN_ERR "%s: expected TCE map for %s\n", __FUNCTION__,
		       dn->full_name);
		return;
	}
	/* Verify that the table contains no entries; it_size is in entries,
	 * and we're examining 64 bits of the bitmap at a time.
	 */
	for (i = 0; i < (tbl->it_size / 64); i++) {
		if (tbl->it_map[i] != 0) {
			printk(KERN_WARNING "%s: Unexpected TCEs for %s\n",
			       __FUNCTION__, dn->full_name);
			break;
		}
	}
	/* calculate bitmap size in bytes */
	bitmap_sz = (tbl->it_size + 7) / 8;

	/* free bitmap */
	order = get_order(bitmap_sz);
	free_pages((unsigned long) tbl->it_map, order);

	/* free table */
	kfree(tbl);
}
/* Creates TCEs for a user provided buffer.  The user buffer must be
 * contiguous real kernel storage (not vmalloc).  The address of the buffer
 * passed here is the kernel (virtual) address of the buffer.  The buffer
 * need not be page aligned; the dma_addr_t returned will point to the same
 * byte within the page as vaddr.
 */
dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr,
			    size_t size, unsigned long mask,
			    enum dma_data_direction direction)
{
	dma_addr_t dma_handle = DMA_ERROR_CODE;
	unsigned long uaddr;
	unsigned int npages;

	BUG_ON(direction == DMA_NONE);

	uaddr = (unsigned long)vaddr;
	npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK);
	npages >>= PAGE_SHIFT;

	if (tbl) {
		dma_handle = iommu_alloc(tbl, vaddr, npages, direction,
					 mask >> PAGE_SHIFT, 0);
		if (dma_handle == DMA_ERROR_CODE) {
			if (printk_ratelimit()) {
				printk(KERN_INFO "iommu_alloc failed, "
				       "tbl %p vaddr %p npages %d\n",
				       tbl, vaddr, npages);
			}
		} else
			dma_handle |= (uaddr & ~PAGE_MASK);
	}

	return dma_handle;
}
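/*
 * Illustrative usage sketch (hypothetical driver code; tbl, buf and len are
 * invented for the example): map a kernel buffer for device reads, then
 * release the TCEs once the DMA is done:
 *
 *	dma_addr_t dma = iommu_map_single(tbl, buf, len, DMA_32BIT_MASK,
 *					  DMA_TO_DEVICE);
 *	if (dma == DMA_ERROR_CODE)
 *		return -ENOMEM;
 *	...
 *	iommu_unmap_single(tbl, dma, len, DMA_TO_DEVICE);
 */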
void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle,
			size_t size, enum dma_data_direction direction)
{
	BUG_ON(direction == DMA_NONE);

	if (tbl)
		iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) -
			   (dma_handle & PAGE_MASK)) >> PAGE_SHIFT);
}
/* Allocates a contiguous real buffer and creates mappings over it.
 * Returns the virtual address of the buffer and sets dma_handle
 * to the dma address (mapping) of the first page.
 */
void *iommu_alloc_coherent(struct iommu_table *tbl, size_t size,
		dma_addr_t *dma_handle, unsigned long mask, gfp_t flag,
		int node)
{
	void *ret = NULL;
	dma_addr_t mapping;
	unsigned int npages, order;
	struct page *page;
	size = PAGE_ALIGN(size);
	npages = size >> PAGE_SHIFT;
	order = get_order(size);

	/*
	 * Client asked for way too much space.  This is checked later
	 * anyway.  It is easier to debug here for the drivers than in
	 * the tce tables.
	 */
	if (order >= IOMAP_MAX_ORDER) {
		printk(KERN_ERR "iommu_alloc_coherent: size too large: 0x%lx\n",
		       size);
		return NULL;
	}

	if (!tbl)
		return NULL;
	/* Alloc enough pages (and possibly more) */
	page = alloc_pages_node(node, flag, order);
	if (!page)
		return NULL;
	ret = page_address(page);
	memset(ret, 0, size);
	/* Set up tces to cover the allocated range */
	mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL,
			      mask >> PAGE_SHIFT, order);
	if (mapping == DMA_ERROR_CODE) {
		free_pages((unsigned long)ret, order);
		return NULL;
	}
	*dma_handle = mapping;
	return ret;
}
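/*
 * Illustrative usage sketch (hypothetical driver code; the table, size and
 * node are invented for the example): allocate a zeroed, DMA-mapped buffer
 * and free it again:
 *
 *	dma_addr_t handle;
 *	void *va = iommu_alloc_coherent(tbl, 4096, &handle, DMA_32BIT_MASK,
 *					GFP_KERNEL, 0);
 *	if (!va)
 *		return -ENOMEM;
 *	...
 *	iommu_free_coherent(tbl, 4096, va, handle);
 */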
void iommu_free_coherent(struct iommu_table *tbl, size_t size,
			 void *vaddr, dma_addr_t dma_handle)
{
	if (tbl) {
		unsigned int npages;

		size = PAGE_ALIGN(size);
		npages = size >> PAGE_SHIFT;
		iommu_free(tbl, dma_handle, npages);
		free_pages((unsigned long)vaddr, get_order(size));
	}
}