/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
  See the file COPYING.
*/

#include "fuse_i.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>

MODULE_ALIAS_MISCDEV(FUSE_MINOR);

static kmem_cache_t *fuse_req_cachep;

static inline struct fuse_conn *fuse_get_conn(struct file *file)
{
	struct fuse_conn *fc;

	spin_lock(&fuse_lock);
	fc = file->private_data;
	if (fc && !fc->mounted)
		fc = NULL;
	spin_unlock(&fuse_lock);
	return fc;
}

static inline void fuse_request_init(struct fuse_req *req)
{
	memset(req, 0, sizeof(*req));
	INIT_LIST_HEAD(&req->list);
	init_waitqueue_head(&req->waitq);
	atomic_set(&req->count, 1);
}

struct fuse_req *fuse_request_alloc(void)
{
	struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
	if (req)
		fuse_request_init(req);
	return req;
}

void fuse_request_free(struct fuse_req *req)
{
	kmem_cache_free(fuse_req_cachep, req);
}

static inline void block_sigs(sigset_t *oldset)
{
	sigset_t mask;

	siginitsetinv(&mask, sigmask(SIGKILL));
	sigprocmask(SIG_BLOCK, &mask, oldset);
}

static inline void restore_sigs(sigset_t *oldset)
{
	sigprocmask(SIG_SETMASK, oldset, NULL);
}

void fuse_reset_request(struct fuse_req *req)
{
	int preallocated = req->preallocated;
	BUG_ON(atomic_read(&req->count) != 1);
	fuse_request_init(req);
	req->preallocated = preallocated;
}

static void __fuse_get_request(struct fuse_req *req)
{
	atomic_inc(&req->count);
}

/* Must be called with > 1 refcount */
static void __fuse_put_request(struct fuse_req *req)
{
	BUG_ON(atomic_read(&req->count) < 2);
	atomic_dec(&req->count);
}

static struct fuse_req *do_get_request(struct fuse_conn *fc)
{
	struct fuse_req *req;

	spin_lock(&fuse_lock);
	BUG_ON(list_empty(&fc->unused_list));
	req = list_entry(fc->unused_list.next, struct fuse_req, list);
	list_del_init(&req->list);
	spin_unlock(&fuse_lock);
	fuse_request_init(req);
	req->preallocated = 1;
	req->in.h.uid = current->fsuid;
	req->in.h.gid = current->fsgid;
	req->in.h.pid = current->pid;
	return req;
}

/* This can return NULL, but only in case it's interrupted by a SIGKILL */
struct fuse_req *fuse_get_request(struct fuse_conn *fc)
{
	int intr;
	sigset_t oldset;

	block_sigs(&oldset);
	intr = down_interruptible(&fc->outstanding_sem);
	restore_sigs(&oldset);
	return intr ? NULL : do_get_request(fc);
}

static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
{
	spin_lock(&fuse_lock);
	if (req->preallocated)
		list_add(&req->list, &fc->unused_list);
	else
		fuse_request_free(req);

	/* If we are in debt decrease that first */
	if (fc->outstanding_debt)
		fc->outstanding_debt--;
	else
		up(&fc->outstanding_sem);
	spin_unlock(&fuse_lock);
}

void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
	if (atomic_dec_and_test(&req->count))
		fuse_putback_request(fc, req);
}

void fuse_release_background(struct fuse_req *req)
{
	iput(req->inode);
	iput(req->inode2);
	if (req->file)
		fput(req->file);
	spin_lock(&fuse_lock);
	list_del(&req->bg_entry);
	spin_unlock(&fuse_lock);
}

static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
{
	int i;
	struct fuse_init_out *arg = &req->misc.init_out;

	if (arg->major != FUSE_KERNEL_VERSION)
		fc->conn_error = 1;
	else {
		fc->minor = arg->minor;
		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
	}

	/* After the INIT reply is received other requests can go
	   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
	   up()s on outstanding_sem.  The last up() is done in
	   fuse_putback_request() */
	for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
		up(&fc->outstanding_sem);
}

/*
 * This function is called when a request is finished.  Either a reply
 * has arrived or it was interrupted (and not yet sent) or some error
 * occurred during communication with userspace, or the device file was
 * closed.  It decreases the reference count for the request.  In case
 * of a background request the references to the stored objects are
 * released.  The requester thread is woken up (if still waiting), and
 * finally the request is either freed or put on the unused_list.
 *
 * Called with fuse_lock, unlocks it
 */
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
{
	int putback;
	req->finished = 1;
	putback = atomic_dec_and_test(&req->count);
	spin_unlock(&fuse_lock);
	if (req->background) {
		down_read(&fc->sbput_sem);
		if (fc->mounted)
			fuse_release_background(req);
		up_read(&fc->sbput_sem);
	}
	wake_up(&req->waitq);
	if (req->in.h.opcode == FUSE_INIT)
		process_init_reply(fc, req);
	else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
		/* Special case for failed iget in CREATE */
		u64 nodeid = req->in.h.nodeid;
		__fuse_get_request(req);
		fuse_reset_request(req);
		fuse_send_forget(fc, req, nodeid, 1);
		putback = 0;
	}
	if (putback)
		fuse_putback_request(fc, req);
}

/*
 * Unfortunately request interruption doesn't just solve the deadlock
 * problem, it causes problems too.  These stem from the fact that an
 * interrupted request continues to be processed in userspace, while
 * all the locks and object references (inode and file) held during
 * the operation are released.
 *
 * Releasing the locks is exactly why there's a need to interrupt the
 * request, so there's not a lot that can be done about this, except
 * introduce additional locking in userspace.
 *
 * More important is to keep inode and file references until userspace
 * has replied, otherwise FORGET and RELEASE could be sent while the
 * inode/file is still used by the filesystem.
 *
 * For this reason the concept of a "background" request is introduced.
 * An interrupted request is backgrounded if it has already been sent
 * to userspace.  Backgrounding involves getting an extra reference to
 * the inode(s) or file used in the request, and adding the request to
 * the fc->background list.  When a reply is received for a background
 * request, the object references are released, and the request is
 * removed from the list.  If the filesystem is unmounted while there
 * are still background requests, the list is walked and references
 * are released as if a reply was received.
 *
 * There's one more use for a background request.  The RELEASE message is
 * always sent as background, since it doesn't return an error or
 * inode.
 */
static void background_request(struct fuse_conn *fc, struct fuse_req *req)
{
	req->background = 1;
	list_add(&req->bg_entry, &fc->background);
	if (req->inode)
		req->inode = igrab(req->inode);
	if (req->inode2)
		req->inode2 = igrab(req->inode2);
	if (req->file)
		get_file(req->file);
}

/* Called with fuse_lock held.  Releases, and then reacquires it. */
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
{
	sigset_t oldset;

	spin_unlock(&fuse_lock);
	block_sigs(&oldset);
	wait_event_interruptible(req->waitq, req->finished);
	restore_sigs(&oldset);
	spin_lock(&fuse_lock);
	if (req->finished)
		return;

	req->out.h.error = -EINTR;
	req->interrupted = 1;
	if (req->locked) {
		/* This is uninterruptible sleep, because data is
		   being copied to/from the buffers of req.  During
		   locked state, there mustn't be any filesystem
		   operation (e.g. page fault), since that could lead
		   to deadlock */
		spin_unlock(&fuse_lock);
		wait_event(req->waitq, !req->locked);
		spin_lock(&fuse_lock);
	}
	if (!req->sent && !list_empty(&req->list)) {
		list_del(&req->list);
		__fuse_put_request(req);
	} else if (!req->finished && req->sent)
		background_request(fc, req);
}

static unsigned len_args(unsigned numargs, struct fuse_arg *args)
{
	unsigned nbytes = 0;
	unsigned i;

	for (i = 0; i < numargs; i++)
		nbytes += args[i].size;

	return nbytes;
}

static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
{
	fc->reqctr++;
	/* zero is special */
	if (fc->reqctr == 0)
		fc->reqctr = 1;
	req->in.h.unique = fc->reqctr;
	req->in.h.len = sizeof(struct fuse_in_header) +
		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
	if (!req->preallocated) {
		/* If the request is not preallocated (either FORGET or
		   RELEASE), then still decrease outstanding_sem, so
		   the user can't open an unlimited number of files
		   while not processing the RELEASE requests.  However,
		   for efficiency do it without blocking: if down()
		   would block, just increase the debt instead */
		if (down_trylock(&fc->outstanding_sem))
			fc->outstanding_debt++;
	}
	list_add_tail(&req->list, &fc->pending);
	wake_up(&fc->waitq);
}

/*
 * This can only be interrupted by a SIGKILL
 */
void request_send(struct fuse_conn *fc, struct fuse_req *req)
{
	req->isreply = 1;
	spin_lock(&fuse_lock);
	if (!fc->connected)
		req->out.h.error = -ENOTCONN;
	else if (fc->conn_error)
		req->out.h.error = -ECONNREFUSED;
	else {
		queue_request(fc, req);
		/* acquire extra reference, since request is still needed
		   after request_end() */
		__fuse_get_request(req);

		request_wait_answer(fc, req);
	}
	spin_unlock(&fuse_lock);
}
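
/*
 * Example (an illustrative sketch, not part of the original file): this
 * is roughly how callers elsewhere in FUSE (fuse_do_getattr() in dir.c,
 * for instance) drive the request API defined above.  The function name
 * and the GETATTR argument layout are assumptions made for illustration.
 */
static inline int example_send_getattr(struct fuse_conn *fc,
				       struct inode *inode, u64 nodeid,
				       struct fuse_attr_out *outarg)
{
	int err;
	struct fuse_req *req = fuse_get_request(fc);
	if (!req)
		return -EINTR;	/* interrupted by SIGKILL while waiting */

	req->in.h.opcode = FUSE_GETATTR;
	req->in.h.nodeid = nodeid;
	req->inode = inode;
	req->out.numargs = 1;
	req->out.args[0].size = sizeof(*outarg);
	req->out.args[0].value = outarg;
	request_send(fc, req);		/* blocks until the reply arrives */
	err = req->out.h.error;
	fuse_put_request(fc, req);
	return err;
}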

static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
	spin_lock(&fuse_lock);
	if (fc->connected) {
		queue_request(fc, req);
		spin_unlock(&fuse_lock);
	} else {
		req->out.h.error = -ENOTCONN;
		request_end(fc, req);
	}
}

void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
{
	req->isreply = 0;
	request_send_nowait(fc, req);
}
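
/*
 * Example (an illustrative sketch, not part of the original file): a
 * fire-and-forget message such as FORGET goes through
 * request_send_noreply().  This mirrors fuse_send_forget() in dir.c;
 * the function name here is hypothetical.
 */
static inline void example_send_forget(struct fuse_conn *fc,
				       struct fuse_req *req, u64 nodeid,
				       u64 nlookup)
{
	struct fuse_forget_in *inarg = &req->misc.forget_in;

	inarg->nlookup = nlookup;
	req->in.h.opcode = FUSE_FORGET;
	req->in.h.nodeid = nodeid;
	req->in.numargs = 1;
	req->in.args[0].size = sizeof(struct fuse_forget_in);
	req->in.args[0].value = inarg;
	request_send_noreply(fc, req);	/* no reply; req is put on completion */
}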

void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
	req->isreply = 1;
	spin_lock(&fuse_lock);
	background_request(fc, req);
	spin_unlock(&fuse_lock);
	request_send_nowait(fc, req);
}

void fuse_send_init(struct fuse_conn *fc)
{
	/* This is called from fuse_read_super() so there's guaranteed
	   to be a request available */
	struct fuse_req *req = do_get_request(fc);
	struct fuse_init_in *arg = &req->misc.init_in;
	arg->major = FUSE_KERNEL_VERSION;
	arg->minor = FUSE_KERNEL_MINOR_VERSION;
	req->in.h.opcode = FUSE_INIT;
	req->in.numargs = 1;
	req->in.args[0].size = sizeof(*arg);
	req->in.args[0].value = arg;
	req->out.numargs = 1;
	/* Variable length argument used for backward compatibility
	   with interface version < 7.5.  Rest of init_out is zeroed
	   by do_get_request(), so a short reply is not a problem */
	req->out.argvar = 1;
	req->out.args[0].size = sizeof(struct fuse_init_out);
	req->out.args[0].value = &req->misc.init_out;
	request_send_background(fc, req);
}
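
/*
 * Illustrative note (not part of the original file): the userspace
 * daemon answers the INIT request above with a struct fuse_init_out
 * carrying its own version numbers, roughly:
 *
 *	struct fuse_init_out out = {
 *		.major     = FUSE_KERNEL_VERSION,
 *		.minor     = FUSE_KERNEL_MINOR_VERSION,
 *		.max_write = 65536,		// example value
 *	};
 *
 * process_init_reply() accepts the connection only if out.major matches
 * FUSE_KERNEL_VERSION, and honours out.max_write for minor >= 5.
 */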

/*
 * Lock the request.  Up to the next unlock_request() there mustn't be
 * anything that could cause a page-fault.  If the request was already
 * interrupted, bail out.
 */
static inline int lock_request(struct fuse_req *req)
{
	int err = 0;
	if (req) {
		spin_lock(&fuse_lock);
		if (req->interrupted)
			err = -ENOENT;
		else
			req->locked = 1;
		spin_unlock(&fuse_lock);
	}
	return err;
}

/*
 * Unlock the request.  If it was interrupted while locked, the
 * requester thread is currently waiting for it to be unlocked, so
 * wake it up.
 */
static inline void unlock_request(struct fuse_req *req)
{
	if (req) {
		spin_lock(&fuse_lock);
		req->locked = 0;
		if (req->interrupted)
			wake_up(&req->waitq);
		spin_unlock(&fuse_lock);
	}
}

struct fuse_copy_state {
	int write;
	struct fuse_req *req;
	const struct iovec *iov;
	unsigned long nr_segs;
	unsigned long seglen;
	unsigned long addr;
	struct page *pg;
	void *mapaddr;
	void *buf;
	unsigned len;
};

static void fuse_copy_init(struct fuse_copy_state *cs, int write,
			   struct fuse_req *req, const struct iovec *iov,
			   unsigned long nr_segs)
{
	memset(cs, 0, sizeof(*cs));
	cs->write = write;
	cs->req = req;
	cs->iov = iov;
	cs->nr_segs = nr_segs;
}
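
/*
 * Illustrative note (not part of the original file): a copy state is
 * always driven the same way, as fuse_dev_readv() and fuse_dev_writev()
 * below do it:
 *
 *	fuse_copy_init(&cs, write, req, iov, nr_segs);
 *	err = fuse_copy_one(&cs, &header, sizeof(header));
 *	if (!err)
 *		err = fuse_copy_args(&cs, numargs, argpages, args, zeroing);
 *	fuse_copy_finish(&cs);
 *
 * fuse_copy_finish() must always be called, so that the last user page
 * pinned and mapped by fuse_copy_fill() is released.
 */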

/* Unmap and put previous page of userspace buffer */
static inline void fuse_copy_finish(struct fuse_copy_state *cs)
{
	if (cs->mapaddr) {
		kunmap_atomic(cs->mapaddr, KM_USER0);
		if (cs->write) {
			flush_dcache_page(cs->pg);
			set_page_dirty_lock(cs->pg);
		}
		put_page(cs->pg);
		cs->mapaddr = NULL;
	}
}

/*
 * Get another pageful of the userspace buffer, map it into kernel
 * address space, and lock the request
 */
static int fuse_copy_fill(struct fuse_copy_state *cs)
{
	unsigned long offset;
	int err;

	unlock_request(cs->req);
	fuse_copy_finish(cs);
	if (!cs->seglen) {
		BUG_ON(!cs->nr_segs);
		cs->seglen = cs->iov[0].iov_len;
		cs->addr = (unsigned long) cs->iov[0].iov_base;
		cs->iov++;
		cs->nr_segs--;
	}
	down_read(&current->mm->mmap_sem);
	err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
			     &cs->pg, NULL);
	up_read(&current->mm->mmap_sem);
	if (err < 0)
		return err;
	BUG_ON(err != 1);
	offset = cs->addr % PAGE_SIZE;
	cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
	cs->buf = cs->mapaddr + offset;
	cs->len = min(PAGE_SIZE - offset, cs->seglen);
	cs->seglen -= cs->len;
	cs->addr += cs->len;

	return lock_request(cs->req);
}

/* Do as much copy to/from userspace buffer as we can */
static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
			       unsigned *size)
{
	unsigned ncpy = min(*size, cs->len);
	if (val) {
		if (cs->write)
			memcpy(cs->buf, *val, ncpy);
		else
			memcpy(*val, cs->buf, ncpy);
		*val += ncpy;
	}
	*size -= ncpy;
	cs->len -= ncpy;
	cs->buf += ncpy;
	return ncpy;
}

/*
 * Copy a page in the request to/from the userspace buffer.  Must be
 * done atomically
 */
static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
				 unsigned offset, unsigned count, int zeroing)
{
	if (page && zeroing && count < PAGE_SIZE) {
		void *mapaddr = kmap_atomic(page, KM_USER1);
		memset(mapaddr, 0, PAGE_SIZE);
		kunmap_atomic(mapaddr, KM_USER1);
	}
	while (count) {
		int err;
		if (!cs->len && (err = fuse_copy_fill(cs)))
			return err;
		if (page) {
			void *mapaddr = kmap_atomic(page, KM_USER1);
			void *buf = mapaddr + offset;
			offset += fuse_copy_do(cs, &buf, &count);
			kunmap_atomic(mapaddr, KM_USER1);
		} else
			offset += fuse_copy_do(cs, NULL, &count);
	}
	if (page && !cs->write)
		flush_dcache_page(page);
	return 0;
}

/* Copy pages in the request to/from userspace buffer */
static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
			   int zeroing)
{
	unsigned i;
	struct fuse_req *req = cs->req;
	unsigned offset = req->page_offset;
	unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);

	for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
		struct page *page = req->pages[i];
		int err = fuse_copy_page(cs, page, offset, count, zeroing);
		if (err)
			return err;

		nbytes -= count;
		count = min(nbytes, (unsigned) PAGE_SIZE);
		offset = 0;
	}
	return 0;
}

/* Copy a single argument in the request to/from userspace buffer */
static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
{
	while (size) {
		int err;
		if (!cs->len && (err = fuse_copy_fill(cs)))
			return err;
		fuse_copy_do(cs, &val, &size);
	}
	return 0;
}

/* Copy request arguments to/from userspace buffer */
static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
			  unsigned argpages, struct fuse_arg *args,
			  int zeroing)
{
	int err = 0;
	unsigned i;

	for (i = 0; !err && i < numargs; i++) {
		struct fuse_arg *arg = &args[i];
		if (i == numargs - 1 && argpages)
			err = fuse_copy_pages(cs, arg->size, zeroing);
		else
			err = fuse_copy_one(cs, arg->value, arg->size);
	}
	return err;
}

/* Wait until a request is available on the pending list */
static void request_wait(struct fuse_conn *fc)
{
	DECLARE_WAITQUEUE(wait, current);

	add_wait_queue_exclusive(&fc->waitq, &wait);
	while (fc->mounted && list_empty(&fc->pending)) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (signal_pending(current))
			break;

		spin_unlock(&fuse_lock);
		schedule();
		spin_lock(&fuse_lock);
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(&fc->waitq, &wait);
}

/*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
 * the pending list and copies the request data to the userspace buffer.
 * If no reply is needed (FORGET) or the request has been interrupted or
 * there was an error during the copying, then it is finished by calling
 * request_end().  Otherwise add it to the processing list, and set
 * the 'sent' flag.
 */
static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
			      unsigned long nr_segs, loff_t *off)
{
	int err;
	struct fuse_conn *fc;
	struct fuse_req *req;
	struct fuse_in *in;
	struct fuse_copy_state cs;
	unsigned reqsize;

 restart:
	spin_lock(&fuse_lock);
	fc = file->private_data;
	err = -EPERM;
	if (!fc)
		goto err_unlock;
	request_wait(fc);
	err = -ENODEV;
	if (!fc->mounted)
		goto err_unlock;
	err = -ERESTARTSYS;
	if (list_empty(&fc->pending))
		goto err_unlock;

	req = list_entry(fc->pending.next, struct fuse_req, list);
	list_del_init(&req->list);

	in = &req->in;
	reqsize = in->h.len;
	/* If request is too large, reply with an error and restart the read */
	if (iov_length(iov, nr_segs) < reqsize) {
		req->out.h.error = -EIO;
		/* SETXATTR is special, since it may contain too large data */
		if (in->h.opcode == FUSE_SETXATTR)
			req->out.h.error = -E2BIG;
		request_end(fc, req);
		goto restart;
	}
	spin_unlock(&fuse_lock);
	fuse_copy_init(&cs, 1, req, iov, nr_segs);
	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
	if (!err)
		err = fuse_copy_args(&cs, in->numargs, in->argpages,
				     (struct fuse_arg *) in->args, 0);
	fuse_copy_finish(&cs);
	spin_lock(&fuse_lock);
	req->locked = 0;
	if (!err && req->interrupted)
		err = -ENOENT;
	if (err) {
		if (!req->interrupted)
			req->out.h.error = -EIO;
		request_end(fc, req);
		return err;
	}
	if (!req->isreply)
		request_end(fc, req);
	else {
		req->sent = 1;
		list_add_tail(&req->list, &fc->processing);
		spin_unlock(&fuse_lock);
	}
	return reqsize;

 err_unlock:
	spin_unlock(&fuse_lock);
	return err;
}

static ssize_t fuse_dev_read(struct file *file, char __user *buf,
			     size_t nbytes, loff_t *off)
{
	struct iovec iov;
	iov.iov_len = nbytes;
	iov.iov_base = buf;
	return fuse_dev_readv(file, &iov, 1, off);
}

/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
{
	struct list_head *entry;

	list_for_each(entry, &fc->processing) {
		struct fuse_req *req;
		req = list_entry(entry, struct fuse_req, list);
		if (req->in.h.unique == unique)
			return req;
	}
	return NULL;
}

static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
			 unsigned nbytes)
{
	unsigned reqsize = sizeof(struct fuse_out_header);

	if (out->h.error)
		return nbytes != reqsize ? -EINVAL : 0;

	reqsize += len_args(out->numargs, out->args);

	if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
		return -EINVAL;
	else if (reqsize > nbytes) {
		struct fuse_arg *lastarg = &out->args[out->numargs-1];
		unsigned diffsize = reqsize - nbytes;
		if (diffsize > lastarg->size)
			return -EINVAL;
		lastarg->size -= diffsize;
	}
	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
			      out->page_zeroing);
}

/*
 * Write a single reply to a request.  First the header is copied from
 * the write buffer.  The request is then searched on the processing
 * list by the unique ID found in the header.  If found, it is removed
 * from the list and the rest of the buffer is copied to the request.
 * The request is finished by calling request_end()
 */
static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
			       unsigned long nr_segs, loff_t *off)
{
	int err;
	unsigned nbytes = iov_length(iov, nr_segs);
	struct fuse_req *req;
	struct fuse_out_header oh;
	struct fuse_copy_state cs;
	struct fuse_conn *fc = fuse_get_conn(file);
	if (!fc)
		return -EPERM;

	fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
	if (nbytes < sizeof(struct fuse_out_header))
		return -EINVAL;

	err = fuse_copy_one(&cs, &oh, sizeof(oh));
	if (err)
		goto err_finish;
	err = -EINVAL;
	if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
	    oh.len != nbytes)
		goto err_finish;

	spin_lock(&fuse_lock);
	req = request_find(fc, oh.unique);
	err = -EINVAL;
	if (!req)
		goto err_unlock;

	list_del_init(&req->list);
	if (req->interrupted) {
		spin_unlock(&fuse_lock);
		fuse_copy_finish(&cs);
		spin_lock(&fuse_lock);
		request_end(fc, req);
		return -ENOENT;
	}
	req->out.h = oh;
	req->locked = 1;
	cs.req = req;
	spin_unlock(&fuse_lock);

	err = copy_out_args(&cs, &req->out, nbytes);
	fuse_copy_finish(&cs);

	spin_lock(&fuse_lock);
	req->locked = 0;
	if (!err) {
		if (req->interrupted)
			err = -ENOENT;
	} else if (!req->interrupted)
		req->out.h.error = -EIO;
	request_end(fc, req);

	return err ? err : nbytes;

 err_unlock:
	spin_unlock(&fuse_lock);
 err_finish:
	fuse_copy_finish(&cs);
	return err;
}

static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
			      size_t nbytes, loff_t *off)
{
	struct iovec iov;
	iov.iov_len = nbytes;
	iov.iov_base = (char __user *) buf;
	return fuse_dev_writev(file, &iov, 1, off);
}
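
/*
 * Illustrative note (not part of the original file): seen from the
 * userspace daemon, the read/write methods above implement a simple
 * request/reply protocol.  A minimal sketch, assuming the usual libfuse
 * mount sequence (device fd passed to mount(2) via the "fd=" option):
 *
 *	int fd = open("/dev/fuse", O_RDWR);
 *	// mount the filesystem with "fd=<fd>,...", then loop:
 *	for (;;) {
 *		// each read() returns exactly one request: a struct
 *		// fuse_in_header followed by the opcode-specific arguments
 *		ssize_t len = read(fd, buf, bufsize);
 *		// ... handle the request, build a reply ...
 *		// each write() must carry one complete reply: a struct
 *		// fuse_out_header (unique copied from the request, len set
 *		// to the total reply size) followed by the reply body
 *		write(fd, reply, replylen);
 *	}
 */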

static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
{
	struct fuse_conn *fc = fuse_get_conn(file);
	unsigned mask = POLLOUT | POLLWRNORM;

	if (!fc)
		return -EPERM;

	poll_wait(file, &fc->waitq, wait);

	spin_lock(&fuse_lock);
	if (!list_empty(&fc->pending))
		mask |= POLLIN | POLLRDNORM;
	spin_unlock(&fuse_lock);

	return mask;
}

/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
	while (!list_empty(head)) {
		struct fuse_req *req;
		req = list_entry(head->next, struct fuse_req, list);
		list_del_init(&req->list);
		req->out.h.error = -ECONNABORTED;
		request_end(fc, req);
		spin_lock(&fuse_lock);
	}
}

static int fuse_dev_release(struct inode *inode, struct file *file)
{
	struct fuse_conn *fc;

	spin_lock(&fuse_lock);
	fc = file->private_data;
	if (fc) {
		fc->connected = 0;
		end_requests(fc, &fc->pending);
		end_requests(fc, &fc->processing);
		fuse_release_conn(fc);
	}
	spin_unlock(&fuse_lock);
	return 0;
}

struct file_operations fuse_dev_operations = {
	.owner		= THIS_MODULE,
	.llseek		= no_llseek,
	.read		= fuse_dev_read,
	.readv		= fuse_dev_readv,
	.write		= fuse_dev_write,
	.writev		= fuse_dev_writev,
	.poll		= fuse_dev_poll,
	.release	= fuse_dev_release,
};

static struct miscdevice fuse_miscdevice = {
	.minor = FUSE_MINOR,
	.name  = "fuse",
	.fops = &fuse_dev_operations,
};

int __init fuse_dev_init(void)
{
	int err = -ENOMEM;
	fuse_req_cachep = kmem_cache_create("fuse_request",
					    sizeof(struct fuse_req),
					    0, 0, NULL, NULL);
	if (!fuse_req_cachep)
		goto out;

	err = misc_register(&fuse_miscdevice);
	if (err)
		goto out_cache_clean;

	return 0;

 out_cache_clean:
	kmem_cache_destroy(fuse_req_cachep);
 out:
	return err;
}

void fuse_dev_cleanup(void)
{
	misc_deregister(&fuse_miscdevice);
	kmem_cache_destroy(fuse_req_cachep);
}