drivers/infiniband/core/umem.c
/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Cisco Systems.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * $Id: uverbs_mem.c 2743 2005-06-28 22:27:59Z roland $
 */

#include <linux/mm.h>
#include <linux/dma-mapping.h>
#include <linux/sched.h>

#include "uverbs.h"

/*
 * Unpin and unmap the pages backing @umem.  If @dirty is set and the
 * umem was mapped writable, the pages are dirtied before being released.
 */
static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
{
        struct ib_umem_chunk *chunk, *tmp;
        int i;

        list_for_each_entry_safe(chunk, tmp, &umem->chunk_list, list) {
                ib_dma_unmap_sg(dev, chunk->page_list,
                                chunk->nents, DMA_BIDIRECTIONAL);
                for (i = 0; i < chunk->nents; ++i) {
                        if (umem->writable && dirty)
                                set_page_dirty_lock(chunk->page_list[i].page);
                        put_page(chunk->page_list[i].page);
                }

                kfree(chunk);
        }
}

/**
 * ib_umem_get - Pin and DMA map userspace memory.
 * @context: userspace context to pin memory for
 * @addr: userspace virtual address to start at
 * @size: length of region to pin
 * @access: IB_ACCESS_xxx flags for memory being pinned
 */
struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
                            size_t size, int access)
{
        struct ib_umem *umem;
        struct page **page_list;
        struct ib_umem_chunk *chunk;
        unsigned long locked;
        unsigned long lock_limit;
        unsigned long cur_base;
        unsigned long npages;
        int ret;
        int off;
        int i;

        if (!can_do_mlock())
                return ERR_PTR(-EPERM);

        umem = kmalloc(sizeof *umem, GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);

        umem->context   = context;
        umem->length    = size;
        umem->offset    = addr & ~PAGE_MASK;
        umem->page_size = PAGE_SIZE;
        /*
         * We ask for writable memory if any access flags other than
         * "remote read" are set.  "Local write" and "remote write"
         * obviously require write access.  "Remote atomic" can do
         * things like fetch and add, which will modify memory, and
         * "MW bind" can change permissions by binding a window.
         */
        umem->writable  = !!(access & ~IB_ACCESS_REMOTE_READ);

        INIT_LIST_HEAD(&umem->chunk_list);

        page_list = (struct page **) __get_free_page(GFP_KERNEL);
        if (!page_list) {
                kfree(umem);
                return ERR_PTR(-ENOMEM);
        }

        npages = PAGE_ALIGN(size + umem->offset) >> PAGE_SHIFT;

        down_write(&current->mm->mmap_sem);

        locked     = npages + current->mm->locked_vm;
        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;

        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
                ret = -ENOMEM;
                goto out;
        }

        cur_base = addr & PAGE_MASK;

        ret = 0;
        while (npages) {
                ret = get_user_pages(current, current->mm, cur_base,
                                     min_t(int, npages,
                                           PAGE_SIZE / sizeof (struct page *)),
                                     1, !umem->writable, page_list, NULL);

                if (ret < 0)
                        goto out;

                cur_base += ret * PAGE_SIZE;
                npages   -= ret;

                off = 0;

                while (ret) {
                        chunk = kmalloc(sizeof *chunk + sizeof (struct scatterlist) *
                                        min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK),
                                        GFP_KERNEL);
                        if (!chunk) {
                                ret = -ENOMEM;
                                goto out;
                        }

                        chunk->nents = min_t(int, ret, IB_UMEM_MAX_PAGE_CHUNK);
                        for (i = 0; i < chunk->nents; ++i) {
                                chunk->page_list[i].page   = page_list[i + off];
                                chunk->page_list[i].offset = 0;
                                chunk->page_list[i].length = PAGE_SIZE;
                        }

                        chunk->nmap = ib_dma_map_sg(context->device,
                                                    &chunk->page_list[0],
                                                    chunk->nents,
                                                    DMA_BIDIRECTIONAL);
                        if (chunk->nmap <= 0) {
                                for (i = 0; i < chunk->nents; ++i)
                                        put_page(chunk->page_list[i].page);
                                kfree(chunk);

                                ret = -ENOMEM;
                                goto out;
                        }

                        ret -= chunk->nents;
                        off += chunk->nents;
                        list_add_tail(&chunk->list, &umem->chunk_list);
                }

                ret = 0;
        }

out:
        if (ret < 0) {
                __ib_umem_release(context->device, umem, 0);
                kfree(umem);
        } else
                current->mm->locked_vm = locked;

        up_write(&current->mm->mmap_sem);
        free_page((unsigned long) page_list);

        return ret < 0 ? ERR_PTR(ret) : umem;
}
EXPORT_SYMBOL(ib_umem_get);

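/*
 * Usage sketch (hypothetical driver code, not part of this file): a verbs
 * provider would typically pin the user buffer while registering a memory
 * region and release it when the region is destroyed.  The "mr", "acc" and
 * "err_free" names below are illustrative only:
 *
 *	mr->umem = ib_umem_get(pd->uobject->context, start, length, acc);
 *	if (IS_ERR(mr->umem)) {
 *		err = PTR_ERR(mr->umem);
 *		goto err_free;
 *	}
 *	...
 *	ib_umem_release(mr->umem);
 */
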
/*
 * Deferred locked_vm accounting, run from the system workqueue when
 * ib_umem_release() could not take the mmap_sem itself.
 */
static void ib_umem_account(struct work_struct *work)
{
        struct ib_umem *umem = container_of(work, struct ib_umem, work);

        down_write(&umem->mm->mmap_sem);
        umem->mm->locked_vm -= umem->diff;
        up_write(&umem->mm->mmap_sem);
        mmput(umem->mm);
        kfree(umem);
}

/**
 * ib_umem_release - release memory pinned with ib_umem_get
 * @umem: umem struct to release
 */
void ib_umem_release(struct ib_umem *umem)
{
        struct ib_ucontext *context = umem->context;
        struct mm_struct *mm;
        unsigned long diff;

        __ib_umem_release(umem->context->device, umem, 1);

        mm = get_task_mm(current);
        if (!mm) {
                kfree(umem);
                return;
        }

        diff = PAGE_ALIGN(umem->length + umem->offset) >> PAGE_SHIFT;

        /*
         * We may be called with the mm's mmap_sem already held.  This
         * can happen when a userspace munmap() is the call that drops
         * the last reference to our file and calls our release
         * method.  If there are memory regions to destroy, we'll end
         * up here and not be able to take the mmap_sem.  In that case
         * we defer the locked_vm accounting to the system workqueue.
         */
        if (context->closing) {
                if (!down_write_trylock(&mm->mmap_sem)) {
                        INIT_WORK(&umem->work, ib_umem_account);
                        umem->mm   = mm;
                        umem->diff = diff;

                        schedule_work(&umem->work);
                        return;
                }
        } else
                down_write(&mm->mmap_sem);

        current->mm->locked_vm -= diff;
        up_write(&mm->mmap_sem);
        mmput(mm);
        kfree(umem);
}
EXPORT_SYMBOL(ib_umem_release);

/**
 * ib_umem_page_count - number of pages (of umem->page_size) in the DMA
 *   mapping of a umem
 * @umem: umem to count pages for
 */
int ib_umem_page_count(struct ib_umem *umem)
{
        struct ib_umem_chunk *chunk;
        int shift;
        int i;
        int n;

        shift = ilog2(umem->page_size);

        n = 0;
        list_for_each_entry(chunk, &umem->chunk_list, list)
                for (i = 0; i < chunk->nmap; ++i)
                        n += sg_dma_len(&chunk->page_list[i]) >> shift;

        return n;
}
EXPORT_SYMBOL(ib_umem_page_count);
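
/*
 * Usage sketch (hypothetical, not part of this file): a low-level driver
 * typically sizes its translation table with ib_umem_page_count() and then
 * walks the chunk list to collect DMA addresses, along these lines (the
 * "pages" array and index variables are illustrative only):
 *
 *	int shift = ilog2(umem->page_size);
 *	struct ib_umem_chunk *chunk;
 *	int i, k, len, n = 0;
 *	u64 *pages;
 *
 *	pages = kmalloc(ib_umem_page_count(umem) * sizeof *pages, GFP_KERNEL);
 *	...
 *	list_for_each_entry(chunk, &umem->chunk_list, list)
 *		for (i = 0; i < chunk->nmap; ++i) {
 *			len = sg_dma_len(&chunk->page_list[i]) >> shift;
 *			for (k = 0; k < len; ++k)
 *				pages[n++] = sg_dma_address(&chunk->page_list[i]) +
 *					     umem->page_size * k;
 *		}
 */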