2 FUSE: Filesystem in Userspace
3 Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu>
5 This program can be distributed under the terms of the GNU GPL.
11 #include <linux/init.h>
12 #include <linux/module.h>
13 #include <linux/poll.h>
14 #include <linux/uio.h>
15 #include <linux/miscdevice.h>
16 #include <linux/pagemap.h>
17 #include <linux/file.h>
18 #include <linux/slab.h>
/* Module alias keyed on the FUSE misc device minor number. */
20 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
/* Slab cache from which struct fuse_req objects are allocated and freed. */
22 static kmem_cache_t *fuse_req_cachep;
/*
 * Read the connection stored in file->private_data while holding
 * fuse_lock.  The visible test singles out a non-NULL connection that
 * is not mounted.
 * NOTE(review): the statement guarded by that test and the return are
 * not visible here -- presumably an unmounted connection is reported as
 * absent; confirm.
 */
24 static struct fuse_conn *fuse_get_conn(struct file *file)
27 spin_lock(&fuse_lock);
28 fc = file->private_data;
29 if (fc && !fc->mounted)
31 spin_unlock(&fuse_lock);
/*
 * Put a request into its initial state: zero the whole structure, then
 * set up its list head and wait queue and give it a reference count
 * of 1.
 */
35 static void fuse_request_init(struct fuse_req *req)
37 memset(req, 0, sizeof(*req));
38 INIT_LIST_HEAD(&req->list);
39 init_waitqueue_head(&req->waitq);
40 atomic_set(&req->count, 1);
/*
 * Allocate a request from fuse_req_cachep (SLAB_KERNEL) and initialise
 * it with fuse_request_init().
 * NOTE(review): no NULL check on the allocation and no return statement
 * are visible here -- confirm both exist.
 */
43 struct fuse_req *fuse_request_alloc(void)
45 struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
47 fuse_request_init(req);
/* Give a request back to the fuse_req_cachep slab cache. */
51 void fuse_request_free(struct fuse_req *req)
53 kmem_cache_free(fuse_req_cachep, req);
/*
 * Block every signal except SIGKILL for the current task.  The previous
 * signal mask is stored in *oldset so restore_sigs() can reinstate it.
 */
56 static void block_sigs(sigset_t *oldset)
60 siginitsetinv(&mask, sigmask(SIGKILL));
61 sigprocmask(SIG_BLOCK, &mask, oldset);
/* Reinstate the signal mask saved in *oldset. */
64 static void restore_sigs(sigset_t *oldset)
66 sigprocmask(SIG_SETMASK, oldset, NULL);
/*
 * Re-initialise a request so it can be reused.  Everything is reset by
 * fuse_request_init() except the preallocated flag, which is saved and
 * restored.  The caller must hold the only reference (count == 1);
 * anything else triggers BUG_ON.
 */
69 void fuse_reset_request(struct fuse_req *req)
71 int preallocated = req->preallocated;
72 BUG_ON(atomic_read(&req->count) != 1);
73 fuse_request_init(req);
74 req->preallocated = preallocated;
/* Take one additional reference on the request. */
77 static void __fuse_get_request(struct fuse_req *req)
79 atomic_inc(&req->count);
/*
 * Drop a reference that is known not to be the last one.  The request
 * is never released here; a count below 2 on entry triggers BUG_ON.
 */
82 /* Must be called with > 1 refcount */
83 static void __fuse_put_request(struct fuse_req *req)
85 BUG_ON(atomic_read(&req->count) < 2);
86 atomic_dec(&req->count);
/*
 * Take the first request off fc->unused_list (under fuse_lock),
 * re-initialise it, mark it preallocated and stamp the request header
 * with the calling task's fsuid, fsgid and pid.  The list must not be
 * empty; an empty list triggers BUG_ON.
 */
89 static struct fuse_req *do_get_request(struct fuse_conn *fc)
93 spin_lock(&fuse_lock);
94 BUG_ON(list_empty(&fc->unused_list));
95 req = list_entry(fc->unused_list.next, struct fuse_req, list);
96 list_del_init(&req->list);
97 spin_unlock(&fuse_lock);
/* fuse_request_init() zeroes the request, so the flag is set afterwards */
98 fuse_request_init(req);
99 req->preallocated = 1;
100 req->in.h.uid = current->fsuid;
101 req->in.h.gid = current->fsgid;
102 req->in.h.pid = current->pid;
/*
 * Obtain a preallocated request for a new operation.  The caller first
 * acquires fc->outstanding_sem with down_interruptible(); if that wait
 * is interrupted NULL is returned, otherwise the request comes from
 * do_get_request().
 * NOTE(review): the signal mask is restored from oldset after the down;
 * the matching block_sigs() call is presumably on a line not visible
 * here -- confirm.
 */
106 /* This can return NULL, but only in case it's interrupted by a SIGKILL */
107 struct fuse_req *fuse_get_request(struct fuse_conn *fc)
113 intr = down_interruptible(&fc->outstanding_sem);
114 restore_sigs(&oldset);
115 return intr ? NULL : do_get_request(fc);
/*
 * Dispose of a request whose last reference has been dropped.  Runs
 * under fuse_lock.  A preallocated request is put back on
 * fc->unused_list; fuse_request_free() handles the request otherwise.
 * One unit of the outstanding accounting is then returned, either by
 * reducing fc->outstanding_debt or by up() on fc->outstanding_sem.
 * NOTE(review): the "else" keywords separating these alternatives are
 * not visible here -- presumably each pair is if/else; confirm.
 */
118 static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
120 spin_lock(&fuse_lock);
121 if (req->preallocated)
122 list_add(&req->list, &fc->unused_list);
124 fuse_request_free(req);
126 /* If we are in debt decrease that first */
127 if (fc->outstanding_debt)
128 fc->outstanding_debt--;
130 up(&fc->outstanding_sem);
131 spin_unlock(&fuse_lock);
/*
 * Drop one reference on the request.  When the count reaches zero the
 * request is handed to fuse_putback_request().
 */
134 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
136 if (atomic_dec_and_test(&req->count))
137 fuse_putback_request(fc, req);
/*
 * Finish the background handling of a request: unlink it from the
 * background list (req->bg_entry) under fuse_lock.
 * NOTE(review): the lines between the signature and the lock are not
 * visible here -- presumably they drop the inode/file references taken
 * when the request was backgrounded; confirm.
 */
140 void fuse_release_background(struct fuse_req *req)
146 spin_lock(&fuse_lock);
147 list_del(&req->bg_entry);
148 spin_unlock(&fuse_lock);
/*
 * Handle the reply to a FUSE_INIT request, read from req->misc.init_out.
 * A reply carrying an error or a different major version is treated
 * separately from a good one; a good reply records the negotiated minor
 * version and max_write in the connection (max_write is fixed at 4096
 * for minor versions below 5).
 * NOTE(review): what happens on the error/mismatch branch is not
 * visible here -- confirm.
 */
151 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
154 struct fuse_init_out *arg = &req->misc.init_out;
156 if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
159 fc->minor = arg->minor;
160 fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
163 /* After INIT reply is received other requests can go
164 out. So do (FUSE_MAX_OUTSTANDING - 1) number of
165 up()s on outstanding_sem. The last up() is done in
166 fuse_putback_request() */
167 for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
168 up(&fc->outstanding_sem);
/*
 * Summary of the visible steps: the request is marked
 * FUSE_REQ_FINISHED and fuse_lock is dropped before fc->sbput_sem is
 * taken for a background request.  After the requester is woken, a
 * FUSE_INIT request is passed to process_init_reply(), and a
 * FUSE_RELEASE request with no inode is reset and reused to send a
 * FORGET for its node id.
 */
172 * This function is called when a request is finished. Either a reply
173 * has arrived or it was interrupted (and not yet sent) or some error
174 * occurred during communication with userspace, or the device file
175 * was closed. In case of a background request the reference to the
176 * stored objects are released. The requester thread is woken up (if
177 * still waiting), and finally the reference to the request is
180 * Called with fuse_lock, unlocks it
182 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
184 req->state = FUSE_REQ_FINISHED;
185 spin_unlock(&fuse_lock);
186 if (req->background) {
187 down_read(&fc->sbput_sem);
189 fuse_release_background(req);
190 up_read(&fc->sbput_sem);
192 wake_up(&req->waitq);
193 if (req->in.h.opcode == FUSE_INIT)
194 process_init_reply(fc, req);
195 else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
196 /* Special case for failed iget in CREATE */
/* the node id is saved first because fuse_reset_request() zeroes the request */
197 u64 nodeid = req->in.h.nodeid;
198 fuse_reset_request(req);
199 fuse_send_forget(fc, req, nodeid, 1);
202 fuse_put_request(fc, req);
206 * Unfortunately request interruption not only solves the deadlock
207 * problem, it causes problems too. These stem from the fact that an
208 * interrupted request continues to be processed in userspace,
209 * while all the locks and object references (inode and file) held
210 * during the operation are released.
212 * To release the locks is exactly why there's a need to interrupt the
213 * request, so there's not a lot that can be done about this, except
214 * introduce additional locking in userspace.
216 * More important is to keep inode and file references until userspace
217 * has replied, otherwise FORGET and RELEASE could be sent while the
218 * inode/file is still used by the filesystem.
220 * For this reason the concept of "background" request is introduced.
221 * An interrupted request is backgrounded if it has been already sent
222 * to userspace. Backgrounding involves getting an extra reference to
223 * inode(s) or file used in the request, and adding the request to
224 * fc->background list. When a reply is received for a background
225 * request, the object references are released, and the request is
226 * removed from the list. If the filesystem is unmounted while there
227 * are still background requests, the list is walked and references
228 * are released as if a reply was received.
230 * There's one more use for a background request. The RELEASE message is
231 * always sent as background, since it doesn't return an error or
/*
 * Turn a request into a background request: add it to fc->background
 * via req->bg_entry and take references on its inode(s) with igrab().
 * request_send_background() calls this with fuse_lock held.
 * NOTE(review): any NULL guards around the igrab() calls, and any
 * handling of a file reference, are not visible here -- confirm.
 */
234 static void background_request(struct fuse_conn *fc, struct fuse_req *req)
237 list_add(&req->bg_entry, &fc->background);
239 req->inode = igrab(req->inode);
241 req->inode2 = igrab(req->inode2);
/*
 * Wait for the request to reach FUSE_REQ_FINISHED.  The wait itself is
 * interruptible and runs without fuse_lock.  If the request is still
 * unfinished afterwards it is flagged as interrupted with -EINTR; the
 * function then waits (uninterruptibly) until the request is no longer
 * locked for copying, and finally either removes a still-pending
 * request from its list, dropping one reference, or backgrounds a
 * request that has already been sent to userspace.
 * NOTE(review): the early exit taken when the request has finished and
 * the signal blocking around the first wait are on lines not visible
 * here -- confirm.
 */
246 /* Called with fuse_lock held. Releases, and then reacquires it. */
247 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
251 spin_unlock(&fuse_lock);
253 wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
254 restore_sigs(&oldset);
255 spin_lock(&fuse_lock);
256 if (req->state == FUSE_REQ_FINISHED)
259 req->out.h.error = -EINTR;
260 req->interrupted = 1;
262 /* This is uninterruptible sleep, because data is
263 being copied to/from the buffers of req. During
264 locked state, there mustn't be any filesystem
265 operation (e.g. page fault), since that could lead
267 spin_unlock(&fuse_lock);
268 wait_event(req->waitq, !req->locked);
269 spin_lock(&fuse_lock);
271 if (req->state == FUSE_REQ_PENDING) {
272 list_del(&req->list);
273 __fuse_put_request(req);
274 } else if (req->state == FUSE_REQ_SENT)
275 background_request(fc, req);
/*
 * Add up the size fields of the first numargs entries of args,
 * accumulating the total in nbytes.
 */
278 static unsigned len_args(unsigned numargs, struct fuse_arg *args)
283 for (i = 0; i < numargs; i++)
284 nbytes += args[i].size;
/*
 * Prepare a request for delivery and queue it.  The header receives the
 * connection's current request counter as its unique id and the total
 * length (input header plus all input arguments); the request is then
 * appended to fc->pending in state FUSE_REQ_PENDING.  Requests that are
 * not preallocated are additionally charged against
 * fc->outstanding_sem, or against fc->outstanding_debt when the
 * semaphore cannot be taken without blocking.
 * NOTE(review): how fc->reqctr is advanced (see "zero is special") and
 * any wake-up of the reader are not visible here -- confirm.
 */
289 static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
292 /* zero is special */
295 req->in.h.unique = fc->reqctr;
296 req->in.h.len = sizeof(struct fuse_in_header) +
297 len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
298 if (!req->preallocated) {
299 /* If request is not preallocated (either FORGET or
300 RELEASE), then still decrease outstanding_sem, so
301 user can't open infinite number of files while not
302 processing the RELEASE requests. However for
303 efficiency do it without blocking, so if down()
304 would block, just increase the debt instead */
305 if (down_trylock(&fc->outstanding_sem))
306 fc->outstanding_debt++;
308 list_add_tail(&req->list, &fc->pending);
309 req->state = FUSE_REQ_PENDING;
/*
 * Send a request and wait for it to finish.  Under fuse_lock the
 * request either fails immediately (-ENOTCONN, or -ECONNREFUSED when
 * fc->conn_error is set) or is queued, given an extra reference and
 * waited for in request_wait_answer().  The outcome is left in
 * req->out.h.error.
 * NOTE(review): the condition selecting -ENOTCONN is not visible here
 * -- presumably the connection not being mounted; confirm.
 */
314 * This can only be interrupted by a SIGKILL
316 void request_send(struct fuse_conn *fc, struct fuse_req *req)
319 spin_lock(&fuse_lock);
321 req->out.h.error = -ENOTCONN;
322 else if (fc->conn_error)
323 req->out.h.error = -ECONNREFUSED;
325 queue_request(fc, req);
326 /* acquire extra reference, since request is still needed
327 after request_end() */
328 __fuse_get_request(req);
330 request_wait_answer(fc, req);
332 spin_unlock(&fuse_lock);
/*
 * Queue a request without waiting for the reply.  Two outcomes are
 * visible: the request is queued under fuse_lock, or it is failed with
 * -ENOTCONN and finished through request_end() (which releases
 * fuse_lock).
 * NOTE(review): the condition choosing between the two is not visible
 * here -- presumably whether the connection is mounted; confirm.
 */
335 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
337 spin_lock(&fuse_lock);
339 queue_request(fc, req);
340 spin_unlock(&fuse_lock);
342 req->out.h.error = -ENOTCONN;
343 request_end(fc, req);
/*
 * Send a request without waiting for it, via request_send_nowait().
 * NOTE(review): the line preceding the call is not visible here --
 * presumably it marks the request as expecting no reply; confirm.
 */
347 void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
350 request_send_nowait(fc, req);
/*
 * Send a request as a background request: register it with
 * background_request() under fuse_lock, then queue it with
 * request_send_nowait() without waiting for the reply.
 */
353 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
356 spin_lock(&fuse_lock);
357 background_request(fc, req);
358 spin_unlock(&fuse_lock);
359 request_send_nowait(fc, req);
/*
 * Build the FUSE_INIT request and send it in the background.  The
 * request advertises FUSE_KERNEL_VERSION / FUSE_KERNEL_MINOR_VERSION in
 * req->misc.init_in and directs the reply into req->misc.init_out; the
 * reply is evaluated by process_init_reply() when the request ends.
 * NOTE(review): the result of fuse_get_request() is dereferenced
 * without a NULL check, relying on the guarantee stated below.
 */
362 void fuse_send_init(struct fuse_conn *fc)
364 /* This is called from fuse_read_super() so there's guaranteed
365 to be exactly one request available */
366 struct fuse_req *req = fuse_get_request(fc);
367 struct fuse_init_in *arg = &req->misc.init_in;
368 arg->major = FUSE_KERNEL_VERSION;
369 arg->minor = FUSE_KERNEL_MINOR_VERSION;
370 req->in.h.opcode = FUSE_INIT;
372 req->in.args[0].size = sizeof(*arg);
373 req->in.args[0].value = arg;
374 req->out.numargs = 1;
375 /* Variable length argument used for backward compatibility
376 with interface version < 7.5. Rest of init_out is zeroed
377 by do_get_request(), so a short reply is not a problem */
379 req->out.args[0].size = sizeof(struct fuse_init_out);
380 req->out.args[0].value = &req->misc.init_out;
381 request_send_background(fc, req);
/*
 * The visible body checks req->interrupted under fuse_lock.
 * NOTE(review): the lines that mark the request locked, the return
 * values, and any guard for a NULL req are not visible here --
 * presumably an error is returned for an interrupted request; confirm.
 */
385 * Lock the request. Up to the next unlock_request() there mustn't be
386 * anything that could cause a page-fault. If the request was already
387 * interrupted bail out.
389 static int lock_request(struct fuse_req *req)
393 spin_lock(&fuse_lock);
394 if (req->interrupted)
398 spin_unlock(&fuse_lock);
/*
 * The visible body wakes any waiter on req->waitq, under fuse_lock,
 * when the request has been interrupted.
 * NOTE(review): the line clearing the locked state and any guard for a
 * NULL req are not visible here -- confirm.
 */
404 * Unlock request. If it was interrupted during being locked, the
405 * requester thread is currently waiting for it to be unlocked, so
408 static void unlock_request(struct fuse_req *req)
411 spin_lock(&fuse_lock);
413 if (req->interrupted)
414 wake_up(&req->waitq);
415 spin_unlock(&fuse_lock);
/*
 * Cursor used while copying a request to or from the userspace buffer
 * described by an iovec array.  Only some of the fields are visible
 * here; the copy helpers also use write, pg, mapaddr, buf, len and
 * addr.
 */
419 struct fuse_copy_state {
/* request being copied; fuse_dev_writev() starts with NULL */
421 struct fuse_req *req;
/* iovec array being consumed; the copy code reads iov[0] */
422 const struct iovec *iov;
/* number of iovec segments, as passed to fuse_copy_init() */
423 unsigned long nr_segs;
/* bytes of the current segment not yet covered by a mapped page */
424 unsigned long seglen;
/*
 * Initialise a copy cursor: zero the whole state and record the number
 * of iovec segments.
 * NOTE(review): the assignments of write, req and iov are not visible
 * here -- presumably stored in the lines between; confirm.
 */
432 static void fuse_copy_init(struct fuse_copy_state *cs, int write,
433 struct fuse_req *req, const struct iovec *iov,
434 unsigned long nr_segs)
436 memset(cs, 0, sizeof(*cs));
440 cs->nr_segs = nr_segs;
/*
 * Release the currently mapped userspace page: undo the KM_USER0 atomic
 * mapping, flush the data cache for the page and mark it dirty.
 * NOTE(review): the conditions guarding these steps (e.g. whether a
 * page is mapped, whether the page was written) and the final page
 * release are not visible here -- confirm.
 */
443 /* Unmap and put previous page of userspace buffer */
444 static void fuse_copy_finish(struct fuse_copy_state *cs)
447 kunmap_atomic(cs->mapaddr, KM_USER0);
449 flush_dcache_page(cs->pg);
450 set_page_dirty_lock(cs->pg);
/*
 * Advance the copy cursor to the next page of the userspace buffer.
 *
 * The request is unlocked and the previous page released first, since
 * nothing that can page-fault may run while the request is locked.  The
 * next page is then pinned with get_user_pages() under the current
 * task's mmap_sem, mapped with kmap_atomic(), and cs->buf / cs->len are
 * set to the part of that page belonging to the current iovec segment
 * (cs->seglen is reduced accordingly).
 *
 * Returns the result of lock_request() on the request.
 * NOTE(review): the handling of a get_user_pages() failure and the
 * advancing of cs->iov / cs->addr are not visible here -- confirm.
 */
458 * Get another pagefull of userspace buffer, and map it to kernel
459 * address space, and lock request
461 static int fuse_copy_fill(struct fuse_copy_state *cs)
463 unsigned long offset;
466 unlock_request(cs->req);
467 fuse_copy_finish(cs);
469 BUG_ON(!cs->nr_segs);
470 cs->seglen = cs->iov[0].iov_len;
471 cs->addr = (unsigned long) cs->iov[0].iov_base;
475 down_read(&current->mm->mmap_sem);
476 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
478 up_read(&current->mm->mmap_sem);
482 offset = cs->addr % PAGE_SIZE;
483 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
484 cs->buf = cs->mapaddr + offset;
/* never run past the end of the page or of the current segment */
485 cs->len = min(PAGE_SIZE - offset, cs->seglen);
486 cs->seglen -= cs->len;
489 return lock_request(cs->req);
/*
 * Copy at most min(*size, cs->len) bytes between the kernel buffer *val
 * and the mapped userspace window cs->buf.  Both directions are visible
 * (into cs->buf, or out of it).
 * NOTE(review): the test choosing the direction, the NULL-val case used
 * by fuse_copy_page(), the cursor updates and the return value are not
 * visible here -- confirm.
 */
492 /* Do as much copy to/from userspace buffer as we can */
493 static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
495 unsigned ncpy = min(*size, cs->len);
498 memcpy(cs->buf, *val, ncpy);
500 memcpy(*val, cs->buf, ncpy);
/*
 * When zeroing is requested and fewer than PAGE_SIZE bytes will be
 * copied, the whole page is cleared first.  The data is then
 * transferred through fuse_copy_do(), refilling the userspace window
 * with fuse_copy_fill() whenever cs->len is 0.  A second call form
 * passes NULL instead of a mapped page buffer.  The data cache is
 * flushed for a page when cs->write is 0.
 * NOTE(review): the loop over count and the test selecting the
 * NULL-buffer form are not visible here -- presumably "if (page)";
 * confirm.
 */
510 * Copy a page in the request to/from the userspace buffer. Must be
513 static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
514 unsigned offset, unsigned count, int zeroing)
516 if (page && zeroing && count < PAGE_SIZE) {
517 void *mapaddr = kmap_atomic(page, KM_USER1);
518 memset(mapaddr, 0, PAGE_SIZE);
519 kunmap_atomic(mapaddr, KM_USER1);
523 if (!cs->len && (err = fuse_copy_fill(cs)))
/* KM_USER1 is used here; the userspace page is mapped with KM_USER0 */
526 void *mapaddr = kmap_atomic(page, KM_USER1);
527 void *buf = mapaddr + offset;
528 offset += fuse_copy_do(cs, &buf, &count);
529 kunmap_atomic(mapaddr, KM_USER1);
531 offset += fuse_copy_do(cs, NULL, &count);
533 if (page && !cs->write)
534 flush_dcache_page(page);
/*
 * Walk req->pages and pass each page to fuse_copy_page().  The first
 * page starts at req->page_offset and so carries at most the remainder
 * of that page; later pages carry at most PAGE_SIZE bytes.  The loop
 * keeps going after nbytes reaches 0 when zeroing is set.
 * NOTE(review): the error check after fuse_copy_page() and the updates
 * of nbytes and offset are not visible here -- confirm.
 */
538 /* Copy pages in the request to/from userspace buffer */
539 static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
543 struct fuse_req *req = cs->req;
544 unsigned offset = req->page_offset;
545 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
547 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
548 struct page *page = req->pages[i];
549 int err = fuse_copy_page(cs, page, offset, count, zeroing);
554 count = min(nbytes, (unsigned) PAGE_SIZE);
/*
 * Transfer a flat buffer of size bytes through fuse_copy_do(),
 * refilling the userspace window with fuse_copy_fill() when cs->len
 * is 0.
 * NOTE(review): the enclosing loop over size and the return statements
 * are not visible here -- confirm.
 */
560 /* Copy a single argument in the request to/from userspace buffer */
561 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
565 if (!cs->len && (err = fuse_copy_fill(cs)))
567 fuse_copy_do(cs, &val, &size);
/*
 * Copy the numargs arguments in order, stopping at the first error.
 * When argpages is set the last argument is transferred through the
 * request's pages with fuse_copy_pages(); flat arguments go through
 * fuse_copy_one().
 * NOTE(review): the "else" between the two calls is not visible here --
 * presumably they are alternatives; confirm.
 */
572 /* Copy request arguments to/from userspace buffer */
573 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
574 unsigned argpages, struct fuse_arg *args,
580 for (i = 0; !err && i < numargs; i++) {
581 struct fuse_arg *arg = &args[i];
582 if (i == numargs - 1 && argpages)
583 err = fuse_copy_pages(cs, arg->size, zeroing);
585 err = fuse_copy_one(cs, arg->value, arg->size);
/*
 * Sleep as an exclusive, interruptible waiter on fc->waitq while the
 * connection is mounted and fc->pending is empty.  fuse_lock is dropped
 * around the sleep and retaken afterwards, so the caller is expected to
 * hold it on entry.
 * NOTE(review): the action taken when a signal is pending and the
 * schedule call are not visible here -- presumably the loop is left and
 * the caller rechecks the list; confirm.
 */
590 /* Wait until a request is available on the pending list */
591 static void request_wait(struct fuse_conn *fc)
593 DECLARE_WAITQUEUE(wait, current);
595 add_wait_queue_exclusive(&fc->waitq, &wait);
596 while (fc->mounted && list_empty(&fc->pending)) {
597 set_current_state(TASK_INTERRUPTIBLE);
598 if (signal_pending(current))
601 spin_unlock(&fuse_lock);
603 spin_lock(&fuse_lock);
605 set_current_state(TASK_RUNNING);
606 remove_wait_queue(&fc->waitq, &wait);
/*
 * State handling visible below: the request taken from fc->pending is
 * put in state FUSE_REQ_READING and unlinked; once copied it either
 * ends through request_end() or is moved to fc->processing in state
 * FUSE_REQ_SENT.  The copy to userspace runs without fuse_lock.
 * NOTE(review): the waiting step, several conditions and the return
 * values are not visible here -- confirm against the full function.
 */
610 * Read a single request into the userspace filesystem's buffer. This
611 * function waits until a request is available, then removes it from
612 * the pending list and copies request data to userspace buffer. If
613 * no reply is needed (FORGET) or request has been interrupted or
614 * there was an error during the copying then it's finished by calling
615 * request_end(). Otherwise add it to the processing list, and set
618 static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
619 unsigned long nr_segs, loff_t *off)
622 struct fuse_conn *fc;
623 struct fuse_req *req;
625 struct fuse_copy_state cs;
629 spin_lock(&fuse_lock);
630 fc = file->private_data;
639 if (list_empty(&fc->pending))
642 req = list_entry(fc->pending.next, struct fuse_req, list);
643 req->state = FUSE_REQ_READING;
644 list_del_init(&req->list);
648 /* If request is too large, reply with an error and restart the read */
649 if (iov_length(iov, nr_segs) < reqsize) {
650 req->out.h.error = -EIO;
651 /* SETXATTR is special, since it may contain too large data */
652 if (in->h.opcode == FUSE_SETXATTR)
653 req->out.h.error = -E2BIG;
654 request_end(fc, req);
657 spin_unlock(&fuse_lock);
/* second argument 1 is the cursor's "write" flag */
658 fuse_copy_init(&cs, 1, req, iov, nr_segs);
659 err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
661 err = fuse_copy_args(&cs, in->numargs, in->argpages,
662 (struct fuse_arg *) in->args, 0);
663 fuse_copy_finish(&cs);
664 spin_lock(&fuse_lock);
666 if (!err && req->interrupted)
/* an interrupted request keeps the error it already carries */
669 if (!req->interrupted)
670 req->out.h.error = -EIO;
671 request_end(fc, req);
675 request_end(fc, req);
677 req->state = FUSE_REQ_SENT;
678 list_add_tail(&req->list, &fc->processing);
679 spin_unlock(&fuse_lock);
684 spin_unlock(&fuse_lock);
/*
 * Single-buffer read entry point: describes the caller's buffer with a
 * one-element iovec and forwards to fuse_dev_readv().
 * NOTE(review): the assignment of iov.iov_base is not visible here --
 * presumably set from buf; confirm.
 */
688 static ssize_t fuse_dev_read(struct file *file, char __user *buf,
689 size_t nbytes, loff_t *off)
692 iov.iov_len = nbytes;
694 return fuse_dev_readv(file, &iov, 1, off);
/*
 * Linear search of fc->processing for the request whose header unique
 * id equals the given value.
 * NOTE(review): the return statements are not visible here --
 * presumably the match, or NULL when nothing matches; confirm.
 */
697 /* Look up request on processing list by unique ID */
698 static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
700 struct list_head *entry;
702 list_for_each(entry, &fc->processing) {
703 struct fuse_req *req;
704 req = list_entry(entry, struct fuse_req, list);
705 if (req->in.h.unique == unique)
/*
 * Validate the length of a reply (nbytes) against the size the request
 * expects, then copy the reply arguments with fuse_copy_args().
 *
 * One visible case requires the reply to be exactly a bare
 * fuse_out_header (-EINVAL otherwise).  Otherwise the expected size is
 * the header plus all output arguments: a longer reply, or a shorter
 * one when out->argvar is not set, is singled out; with argvar set the
 * last argument is shrunk by the shortfall, provided the shortfall does
 * not exceed that argument's size.
 * NOTE(review): the condition for the bare-header case and the values
 * returned on the rejected paths are not visible here -- presumably an
 * error reply and -EINVAL respectively; confirm.
 */
711 static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
714 unsigned reqsize = sizeof(struct fuse_out_header);
717 return nbytes != reqsize ? -EINVAL : 0;
719 reqsize += len_args(out->numargs, out->args);
721 if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
723 else if (reqsize > nbytes) {
724 struct fuse_arg *lastarg = &out->args[out->numargs-1];
725 unsigned diffsize = reqsize - nbytes;
726 if (diffsize > lastarg->size)
728 lastarg->size -= diffsize;
730 return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
/*
 * Details visible below: the copy cursor starts with no request
 * attached and with the "write" flag 0; a reply shorter than a
 * fuse_out_header is singled out before anything is copied; the header
 * is sanity-checked (non-zero unique id, error within (-1000, 0]); the
 * argument copy runs without fuse_lock; on success the function returns
 * the number of bytes written.
 * NOTE(review): several conditions, the goto targets and the line that
 * attaches the found request to the cursor are not visible here --
 * confirm against the full function.
 */
735 * Write a single reply to a request. First the header is copied from
736 * the write buffer. The request is then searched on the processing
737 * list by the unique ID found in the header. If found, then remove
738 * it from the list and copy the rest of the buffer to the request.
739 * The request is finished by calling request_end()
741 static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
742 unsigned long nr_segs, loff_t *off)
745 unsigned nbytes = iov_length(iov, nr_segs);
746 struct fuse_req *req;
747 struct fuse_out_header oh;
748 struct fuse_copy_state cs;
749 struct fuse_conn *fc = fuse_get_conn(file);
753 fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
754 if (nbytes < sizeof(struct fuse_out_header))
757 err = fuse_copy_one(&cs, &oh, sizeof(oh));
761 if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
765 spin_lock(&fuse_lock);
766 req = request_find(fc, oh.unique);
771 list_del_init(&req->list);
772 if (req->interrupted) {
/* fuse_lock is dropped around fuse_copy_finish(); request_end() needs it held again */
773 spin_unlock(&fuse_lock);
774 fuse_copy_finish(&cs);
775 spin_lock(&fuse_lock);
776 request_end(fc, req);
782 spin_unlock(&fuse_lock);
784 err = copy_out_args(&cs, &req->out, nbytes);
785 fuse_copy_finish(&cs);
787 spin_lock(&fuse_lock);
790 if (req->interrupted)
/* an interrupted request keeps the error it already carries */
792 } else if (!req->interrupted)
793 req->out.h.error = -EIO;
794 request_end(fc, req);
796 return err ? err : nbytes;
799 spin_unlock(&fuse_lock);
801 fuse_copy_finish(&cs);
/*
 * Single-buffer write entry point: describes the caller's buffer with a
 * one-element iovec and forwards to fuse_dev_writev().
 */
805 static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
806 size_t nbytes, loff_t *off)
809 iov.iov_len = nbytes;
/* the cast drops const; iov_base is not const-qualified */
810 iov.iov_base = (char __user *) buf;
811 return fuse_dev_writev(file, &iov, 1, off);
/*
 * Poll handler for the device.  The mask always includes
 * POLLOUT | POLLWRNORM; POLLIN | POLLRDNORM is added when fc->pending
 * is not empty (checked under fuse_lock).  The caller is registered on
 * fc->waitq.
 * NOTE(review): the handling of a missing connection and the return
 * statement are not visible here -- confirm.
 */
814 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
816 struct fuse_conn *fc = fuse_get_conn(file);
817 unsigned mask = POLLOUT | POLLWRNORM;
822 poll_wait(file, &fc->waitq, wait);
824 spin_lock(&fuse_lock);
825 if (!list_empty(&fc->pending))
826 mask |= POLLIN | POLLRDNORM;
827 spin_unlock(&fuse_lock);
/*
 * Empties the list one request at a time: each request is unlinked,
 * given the error -ECONNABORTED and finished with request_end().
 * request_end() releases fuse_lock, so the lock is retaken before the
 * next iteration; the function therefore expects fuse_lock held on
 * entry and leaves it held.
 */
832 /* Abort all requests on the given list (pending or processing) */
833 static void end_requests(struct fuse_conn *fc, struct list_head *head)
835 while (!list_empty(head)) {
836 struct fuse_req *req;
837 req = list_entry(head->next, struct fuse_req, list);
838 list_del_init(&req->list);
839 req->out.h.error = -ECONNABORTED;
840 request_end(fc, req);
841 spin_lock(&fuse_lock);
/*
 * Release handler for the device file.  Under fuse_lock, every request
 * on fc->pending and fc->processing is aborted with end_requests() and
 * the connection is passed to fuse_release_conn().
 * NOTE(review): the guard for a NULL connection, any connection state
 * updates and the return value are not visible here -- confirm.
 */
845 static int fuse_dev_release(struct inode *inode, struct file *file)
847 struct fuse_conn *fc;
849 spin_lock(&fuse_lock);
850 fc = file->private_data;
853 end_requests(fc, &fc->pending);
854 end_requests(fc, &fc->processing);
855 fuse_release_conn(fc);
857 spin_unlock(&fuse_lock);
861 struct file_operations fuse_dev_operations = {
862 .owner = THIS_MODULE,
864 .read = fuse_dev_read,
865 .readv = fuse_dev_readv,
866 .write = fuse_dev_write,
867 .writev = fuse_dev_writev,
868 .poll = fuse_dev_poll,
869 .release = fuse_dev_release,
/*
 * Misc device descriptor for the FUSE device, using
 * fuse_dev_operations as its file operations.
 */
872 static struct miscdevice fuse_miscdevice = {
875 .fops = &fuse_dev_operations,
/*
 * Module initialisation for the device side: create the "fuse_request"
 * slab cache sized for struct fuse_req, then register the misc device.
 * If registration fails the cache is destroyed again (label
 * out_cache_clean).
 * NOTE(review): the return values and the error taken when the cache
 * cannot be created are not visible here -- confirm.
 */
878 int __init fuse_dev_init(void)
881 fuse_req_cachep = kmem_cache_create("fuse_request",
882 sizeof(struct fuse_req),
884 if (!fuse_req_cachep)
887 err = misc_register(&fuse_miscdevice);
889 goto out_cache_clean;
894 kmem_cache_destroy(fuse_req_cachep);
/*
 * Undo fuse_dev_init(): deregister the misc device, then destroy the
 * request slab cache.
 */
899 void fuse_dev_cleanup(void)
901 misc_deregister(&fuse_miscdevice);
902 kmem_cache_destroy(fuse_req_cachep);