/*
  FUSE: Filesystem in Userspace
  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>

  This program can be distributed under the terms of the GNU GPL.
*/

#include "fuse_i.h"

#include <linux/init.h>
#include <linux/module.h>
#include <linux/poll.h>
#include <linux/uio.h>
#include <linux/miscdevice.h>
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/slab.h>

MODULE_ALIAS_MISCDEV(FUSE_MINOR);

static kmem_cache_t *fuse_req_cachep;

static inline struct fuse_conn *fuse_get_conn(struct file *file)
{
        struct fuse_conn *fc;
        spin_lock(&fuse_lock);
        fc = file->private_data;
        if (fc && !fc->mounted)
                fc = NULL;
        spin_unlock(&fuse_lock);
        return fc;
}

static inline void fuse_request_init(struct fuse_req *req)
{
        memset(req, 0, sizeof(*req));
        INIT_LIST_HEAD(&req->list);
        init_waitqueue_head(&req->waitq);
        atomic_set(&req->count, 1);
}

struct fuse_req *fuse_request_alloc(void)
{
        struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, SLAB_KERNEL);
        if (req)
                fuse_request_init(req);
        return req;
}

void fuse_request_free(struct fuse_req *req)
{
        kmem_cache_free(fuse_req_cachep, req);
}

static inline void block_sigs(sigset_t *oldset)
{
        sigset_t mask;

        siginitsetinv(&mask, sigmask(SIGKILL));
        sigprocmask(SIG_BLOCK, &mask, oldset);
}

static inline void restore_sigs(sigset_t *oldset)
{
        sigprocmask(SIG_SETMASK, oldset, NULL);
}

void fuse_reset_request(struct fuse_req *req)
{
        int preallocated = req->preallocated;
        BUG_ON(atomic_read(&req->count) != 1);
        fuse_request_init(req);
        req->preallocated = preallocated;
}

static void __fuse_get_request(struct fuse_req *req)
{
        atomic_inc(&req->count);
}

/* Must be called with > 1 refcount */
static void __fuse_put_request(struct fuse_req *req)
{
        BUG_ON(atomic_read(&req->count) < 2);
        atomic_dec(&req->count);
}

static struct fuse_req *do_get_request(struct fuse_conn *fc)
{
        struct fuse_req *req;

        spin_lock(&fuse_lock);
        BUG_ON(list_empty(&fc->unused_list));
        req = list_entry(fc->unused_list.next, struct fuse_req, list);
        list_del_init(&req->list);
        spin_unlock(&fuse_lock);
        fuse_request_init(req);
        req->preallocated = 1;
        req->in.h.uid = current->fsuid;
        req->in.h.gid = current->fsgid;
        req->in.h.pid = current->pid;
        return req;
}

/* This can return NULL, but only in case it's interrupted by a SIGKILL */
struct fuse_req *fuse_get_request(struct fuse_conn *fc)
{
        int intr;
        sigset_t oldset;

        block_sigs(&oldset);
        intr = down_interruptible(&fc->outstanding_sem);
        restore_sigs(&oldset);
        return intr ? NULL : do_get_request(fc);
}

static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
{
        spin_lock(&fuse_lock);
        if (req->preallocated)
                list_add(&req->list, &fc->unused_list);
        else
                fuse_request_free(req);

        /* If we are in debt decrease that first */
        if (fc->outstanding_debt)
                fc->outstanding_debt--;
        else
                up(&fc->outstanding_sem);
        spin_unlock(&fuse_lock);
}

void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
{
        if (atomic_dec_and_test(&req->count))
                fuse_putback_request(fc, req);
}

void fuse_release_background(struct fuse_req *req)
{
        iput(req->inode);
        iput(req->inode2);
        if (req->file)
                fput(req->file);
        spin_lock(&fuse_lock);
        list_del(&req->bg_entry);
        spin_unlock(&fuse_lock);
}

/*
 * This function is called when a request is finished.  Either a reply
 * has arrived or it was interrupted (and not yet sent) or some error
 * occurred during communication with userspace, or the device file was
 * closed.  It decreases the reference count for the request.  In case
 * of a background request the references to the stored objects are
 * released.  The requester thread is woken up (if still waiting), and
 * finally the request is either freed or put on the unused_list.
 *
 * Called with fuse_lock, unlocks it
 */
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
{
        int putback;
        req->finished = 1;
        putback = atomic_dec_and_test(&req->count);
        spin_unlock(&fuse_lock);
        if (req->background) {
                down_read(&fc->sbput_sem);
                if (fc->mounted)
                        fuse_release_background(req);
                up_read(&fc->sbput_sem);
        }
        wake_up(&req->waitq);
        if (req->in.h.opcode == FUSE_INIT) {
                int i;

                if (req->misc.init_in_out.major != FUSE_KERNEL_VERSION)
                        fc->conn_error = 1;

                fc->minor = req->misc.init_in_out.minor;

                /* After INIT reply is received other requests can go
                   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
                   up()s on outstanding_sem.  The last up() is done in
                   fuse_putback_request() */
                for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
                        up(&fc->outstanding_sem);
        } else if (req->in.h.opcode == FUSE_RELEASE && req->inode == NULL) {
                /* Special case for failed iget in CREATE */
                u64 nodeid = req->in.h.nodeid;
                __fuse_get_request(req);
                fuse_reset_request(req);
                fuse_send_forget(fc, req, nodeid, 1);
                putback = 0;
        }
        if (putback)
                fuse_putback_request(fc, req);
}

/*
 * Unfortunately request interruption does not just solve the deadlock
 * problem; it causes problems too.  These stem from the fact that an
 * interrupted request continues to be processed in userspace, while all
 * the locks and object references (inode and file) held during the
 * operation are released.
 *
 * Releasing the locks is exactly why the request needs to be
 * interrupted, so there's not a lot that can be done about this, except
 * introduce additional locking in userspace.
 *
 * More important is to keep inode and file references until userspace
 * has replied, otherwise FORGET and RELEASE could be sent while the
 * inode/file is still used by the filesystem.
 *
 * For this reason the concept of a "background" request is introduced.
 * An interrupted request is backgrounded if it has already been sent
 * to userspace.  Backgrounding involves getting an extra reference to
 * the inode(s) or file used in the request, and adding the request to
 * the fc->background list.  When a reply is received for a background
 * request, the object references are released, and the request is
 * removed from the list.  If the filesystem is unmounted while there
 * are still background requests, the list is walked and the references
 * are released as if a reply was received.
 *
 * There's one more use for a background request.  The RELEASE message is
 * always sent as background, since it doesn't return an error or data.
 */
static void background_request(struct fuse_conn *fc, struct fuse_req *req)
{
        req->background = 1;
        list_add(&req->bg_entry, &fc->background);
        if (req->inode)
                req->inode = igrab(req->inode);
        if (req->inode2)
                req->inode2 = igrab(req->inode2);
        if (req->file)
                get_file(req->file);
}

/* Called with fuse_lock held.  Releases, and then reacquires it. */
static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
{
        sigset_t oldset;

        spin_unlock(&fuse_lock);
        block_sigs(&oldset);
        wait_event_interruptible(req->waitq, req->finished);
        restore_sigs(&oldset);
        spin_lock(&fuse_lock);
        if (req->finished)
                return;

        req->out.h.error = -EINTR;
        req->interrupted = 1;
        if (req->locked) {
                /* This is uninterruptible sleep, because data is
                   being copied to/from the buffers of req.  During
                   locked state, there mustn't be any filesystem
                   operation (e.g. page fault), since that could lead
                   to deadlock */
                spin_unlock(&fuse_lock);
                wait_event(req->waitq, !req->locked);
                spin_lock(&fuse_lock);
        }
        if (!req->sent && !list_empty(&req->list)) {
                list_del(&req->list);
                __fuse_put_request(req);
        } else if (!req->finished && req->sent)
                background_request(fc, req);
}

static unsigned len_args(unsigned numargs, struct fuse_arg *args)
{
        unsigned nbytes = 0;
        unsigned i;
        for (i = 0; i < numargs; i++)
                nbytes += args[i].size;
        return nbytes;
}

static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
{
        fc->reqctr++;
        /* zero is special */
        if (!fc->reqctr)
                fc->reqctr = 1;
        req->in.h.unique = fc->reqctr;
        req->in.h.len = sizeof(struct fuse_in_header) +
                len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
        if (!req->preallocated) {
                /* If the request is not preallocated (either FORGET or
                   RELEASE), then still decrease outstanding_sem, so the
                   user can't open an unbounded number of files while not
                   processing the RELEASE requests.  However, for
                   efficiency do it without blocking, so if down()
                   would block, just increase the debt instead */
                if (down_trylock(&fc->outstanding_sem))
                        fc->outstanding_debt++;
        }
        list_add_tail(&req->list, &fc->pending);
        wake_up(&fc->waitq);
}

/*
 * This can only be interrupted by a SIGKILL
 */
void request_send(struct fuse_conn *fc, struct fuse_req *req)
{
        req->isreply = 1;
        spin_lock(&fuse_lock);
        if (!fc->connected)
                req->out.h.error = -ENOTCONN;
        else if (fc->conn_error)
                req->out.h.error = -ECONNREFUSED;
        else {
                queue_request(fc, req);
                /* acquire extra reference, since request is still needed
                   after request_end() */
                __fuse_get_request(req);

                request_wait_answer(fc, req);
        }
        spin_unlock(&fuse_lock);
}

static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
{
        spin_lock(&fuse_lock);
        if (fc->connected) {
                queue_request(fc, req);
                spin_unlock(&fuse_lock);
        } else {
                req->out.h.error = -ENOTCONN;
                request_end(fc, req);
        }
}

void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
{
        req->isreply = 0;
        request_send_nowait(fc, req);
}

void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
{
        req->isreply = 1;
        spin_lock(&fuse_lock);
        background_request(fc, req);
        spin_unlock(&fuse_lock);
        request_send_nowait(fc, req);
}

void fuse_send_init(struct fuse_conn *fc)
{
        /* This is called from fuse_read_super() so there's guaranteed
           to be a request available */
        struct fuse_req *req = do_get_request(fc);
        struct fuse_init_in_out *arg = &req->misc.init_in_out;
        arg->major = FUSE_KERNEL_VERSION;
        arg->minor = FUSE_KERNEL_MINOR_VERSION;
        req->in.h.opcode = FUSE_INIT;
        req->in.numargs = 1;
        req->in.args[0].size = sizeof(*arg);
        req->in.args[0].value = arg;
        req->out.numargs = 1;
        req->out.args[0].size = sizeof(*arg);
        req->out.args[0].value = arg;
        request_send_background(fc, req);
}

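/*
 * Editor's note, an illustrative sketch rather than part of this module:
 * the userspace half of the INIT handshake that fuse_send_init() starts.
 * The reply's major version must equal FUSE_KERNEL_VERSION (otherwise
 * request_end() sets fc->conn_error), 'unique' must echo the request, and
 * 'len' must equal the number of bytes written.  'devfd' and the buffer
 * size are hypothetical.
 *
 *	char buf[8192];
 *	ssize_t n = read(devfd, buf, sizeof(buf));
 *	struct fuse_in_header *in = (struct fuse_in_header *) buf;
 *	if (n >= (ssize_t) sizeof(*in) && in->opcode == FUSE_INIT) {
 *		struct {
 *			struct fuse_out_header out;
 *			struct fuse_init_in_out arg;
 *		} reply;
 *		reply.arg.major = FUSE_KERNEL_VERSION;
 *		reply.arg.minor = FUSE_KERNEL_MINOR_VERSION;
 *		reply.out.unique = in->unique;
 *		reply.out.error = 0;
 *		reply.out.len = sizeof(reply);
 *		write(devfd, &reply, sizeof(reply));
 *	}
 */
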
/*
 * Lock the request.  Up to the next unlock_request() there mustn't be
 * anything that could cause a page-fault.  If the request was already
 * interrupted bail out.
 */
static inline int lock_request(struct fuse_req *req)
{
        int err = 0;
        if (req) {
                spin_lock(&fuse_lock);
                if (req->interrupted)
                        err = -ENOENT;
                else
                        req->locked = 1;
                spin_unlock(&fuse_lock);
        }
        return err;
}

/*
 * Unlock request.  If it was interrupted while locked, the requester
 * thread is currently waiting for it to be unlocked, so wake it up.
 */
static inline void unlock_request(struct fuse_req *req)
{
        if (req) {
                spin_lock(&fuse_lock);
                req->locked = 0;
                if (req->interrupted)
                        wake_up(&req->waitq);
                spin_unlock(&fuse_lock);
        }
}

struct fuse_copy_state {
        int write;
        struct fuse_req *req;
        const struct iovec *iov;
        unsigned long nr_segs;
        unsigned long seglen;
        unsigned long addr;
        struct page *pg;
        void *mapaddr;
        void *buf;
        unsigned len;
};

static void fuse_copy_init(struct fuse_copy_state *cs, int write,
                           struct fuse_req *req, const struct iovec *iov,
                           unsigned long nr_segs)
{
        memset(cs, 0, sizeof(*cs));
        cs->write = write;
        cs->req = req;
        cs->iov = iov;
        cs->nr_segs = nr_segs;
}

/* Unmap and put previous page of userspace buffer */
static inline void fuse_copy_finish(struct fuse_copy_state *cs)
{
        if (cs->mapaddr) {
                kunmap_atomic(cs->mapaddr, KM_USER0);
                if (cs->write) {
                        flush_dcache_page(cs->pg);
                        set_page_dirty_lock(cs->pg);
                }
                put_page(cs->pg);
                cs->mapaddr = NULL;
        }
}

/*
 * Get another pagefull of userspace buffer, map it to kernel
 * address space, and lock the request
 */
static int fuse_copy_fill(struct fuse_copy_state *cs)
{
        unsigned long offset;
        int err;

        unlock_request(cs->req);
        fuse_copy_finish(cs);
        if (!cs->seglen) {
                BUG_ON(!cs->nr_segs);
                cs->seglen = cs->iov[0].iov_len;
                cs->addr = (unsigned long) cs->iov[0].iov_base;
                cs->iov++;
                cs->nr_segs--;
        }
        down_read(&current->mm->mmap_sem);
        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
                             &cs->pg, NULL);
        up_read(&current->mm->mmap_sem);
        if (err < 0)
                return err;
        BUG_ON(err != 1);
        offset = cs->addr % PAGE_SIZE;
        cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
        cs->buf = cs->mapaddr + offset;
        cs->len = min(PAGE_SIZE - offset, cs->seglen);
        cs->seglen -= cs->len;
        cs->addr += cs->len;

        return lock_request(cs->req);
}

/* Do as much copy to/from userspace buffer as we can */
static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
                               unsigned *size)
{
        unsigned ncpy = min(*size, cs->len);
        if (val) {
                if (cs->write)
                        memcpy(cs->buf, *val, ncpy);
                else
                        memcpy(*val, cs->buf, ncpy);
                *val += ncpy;
        }
        *size -= ncpy;
        cs->len -= ncpy;
        cs->buf += ncpy;
        return ncpy;
}

/*
 * Copy a page in the request to/from the userspace buffer.  Must be
 * done atomically
 */
static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
                                 unsigned offset, unsigned count, int zeroing)
{
        if (page && zeroing && count < PAGE_SIZE) {
                void *mapaddr = kmap_atomic(page, KM_USER1);
                memset(mapaddr, 0, PAGE_SIZE);
                kunmap_atomic(mapaddr, KM_USER1);
        }
        while (count) {
                int err;
                if (!cs->len && (err = fuse_copy_fill(cs)))
                        return err;
                if (page) {
                        void *mapaddr = kmap_atomic(page, KM_USER1);
                        void *buf = mapaddr + offset;
                        offset += fuse_copy_do(cs, &buf, &count);
                        kunmap_atomic(mapaddr, KM_USER1);
                } else
                        offset += fuse_copy_do(cs, NULL, &count);
        }
        if (page && !cs->write)
                flush_dcache_page(page);
        return 0;
}

/* Copy pages in the request to/from userspace buffer */
static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
                           int zeroing)
{
        unsigned i;
        struct fuse_req *req = cs->req;
        unsigned offset = req->page_offset;
        unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);

        for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
                struct page *page = req->pages[i];
                int err = fuse_copy_page(cs, page, offset, count, zeroing);
                if (err)
                        return err;

                nbytes -= count;
                count = min(nbytes, (unsigned) PAGE_SIZE);
                offset = 0;
        }
        return 0;
}

/* Copy a single argument in the request to/from userspace buffer */
static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
{
        while (size) {
                int err;
                if (!cs->len && (err = fuse_copy_fill(cs)))
                        return err;
                fuse_copy_do(cs, &val, &size);
        }
        return 0;
}

/* Copy request arguments to/from userspace buffer */
static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
                          unsigned argpages, struct fuse_arg *args,
                          int zeroing)
{
        int err = 0;
        unsigned i;

        for (i = 0; !err && i < numargs; i++) {
                struct fuse_arg *arg = &args[i];
                if (i == numargs - 1 && argpages)
                        err = fuse_copy_pages(cs, arg->size, zeroing);
                else
                        err = fuse_copy_one(cs, arg->value, arg->size);
        }
        return err;
}

/* Wait until a request is available on the pending list */
static void request_wait(struct fuse_conn *fc)
{
        DECLARE_WAITQUEUE(wait, current);

        add_wait_queue_exclusive(&fc->waitq, &wait);
        while (fc->mounted && list_empty(&fc->pending)) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (signal_pending(current))
                        break;

                spin_unlock(&fuse_lock);
                schedule();
                spin_lock(&fuse_lock);
        }
        set_current_state(TASK_RUNNING);
        remove_wait_queue(&fc->waitq, &wait);
}

/*
 * Read a single request into the userspace filesystem's buffer.  This
 * function waits until a request is available, then removes it from
 * the pending list and copies the request data to the userspace buffer.
 * If no reply is needed (FORGET), or the request has been interrupted,
 * or there was an error during the copying, then it's finished by
 * calling request_end().  Otherwise add it to the processing list and
 * set the 'sent' flag.
 */
static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
                              unsigned long nr_segs, loff_t *off)
{
        int err;
        struct fuse_conn *fc;
        struct fuse_req *req;
        struct fuse_in *in;
        struct fuse_copy_state cs;
        unsigned reqsize;

        spin_lock(&fuse_lock);
        fc = file->private_data;
        err = -EPERM;
        if (!fc)
                goto err_unlock;
        request_wait(fc);
        err = -ENODEV;
        if (!fc->mounted)
                goto err_unlock;
        err = -ERESTARTSYS;
        if (list_empty(&fc->pending))
                goto err_unlock;

        req = list_entry(fc->pending.next, struct fuse_req, list);
        list_del_init(&req->list);
        spin_unlock(&fuse_lock);

        in = &req->in;
        reqsize = req->in.h.len;
        fuse_copy_init(&cs, 1, req, iov, nr_segs);
        err = -EINVAL;
        if (iov_length(iov, nr_segs) >= reqsize) {
                err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
                if (!err)
                        err = fuse_copy_args(&cs, in->numargs, in->argpages,
                                             (struct fuse_arg *) in->args, 0);
        }
        fuse_copy_finish(&cs);
        spin_lock(&fuse_lock);
        req->locked = 0;
        if (!err && req->interrupted)
                err = -ENOENT;
        if (err) {
                if (!req->interrupted)
                        req->out.h.error = -EIO;
                request_end(fc, req);
                return err;
        }
        if (!req->isreply)
                request_end(fc, req);
        else {
                req->sent = 1;
                list_add_tail(&req->list, &fc->processing);
                spin_unlock(&fuse_lock);
        }
        return reqsize;

 err_unlock:
        spin_unlock(&fuse_lock);
        return err;
}

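/*
 * Editor's note, an illustrative sketch rather than part of this module:
 * the read side of a minimal userspace daemon loop.  fuse_dev_readv()
 * above returns -EINVAL if the supplied buffer is smaller than
 * req->in.h.len, so the daemon reads with a buffer large enough for the
 * largest request it expects; once the filesystem is unmounted the read
 * fails with -ENODEV.  'devfd', the buffer size and dispatch() are
 * hypothetical.
 *
 *	char buf[65536];
 *	for (;;) {
 *		ssize_t n = read(devfd, buf, sizeof(buf));
 *		if (n < 0) {
 *			if (errno == EINTR)
 *				continue;
 *			break;
 *		}
 *		struct fuse_in_header *in = (struct fuse_in_header *) buf;
 *		dispatch(in, buf + sizeof(*in), n - sizeof(*in));
 *	}
 */
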
static ssize_t fuse_dev_read(struct file *file, char __user *buf,
                             size_t nbytes, loff_t *off)
{
        struct iovec iov;
        iov.iov_len = nbytes;
        iov.iov_base = buf;
        return fuse_dev_readv(file, &iov, 1, off);
}

/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
{
        struct list_head *entry;

        list_for_each(entry, &fc->processing) {
                struct fuse_req *req;
                req = list_entry(entry, struct fuse_req, list);
                if (req->in.h.unique == unique)
                        return req;
        }
        return NULL;
}

static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
                         unsigned nbytes)
{
        unsigned reqsize = sizeof(struct fuse_out_header);

        if (out->h.error)
                return nbytes != reqsize ? -EINVAL : 0;

        reqsize += len_args(out->numargs, out->args);

        if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
                return -EINVAL;
        else if (reqsize > nbytes) {
                struct fuse_arg *lastarg = &out->args[out->numargs-1];
                unsigned diffsize = reqsize - nbytes;
                if (diffsize > lastarg->size)
                        return -EINVAL;
                lastarg->size -= diffsize;
        }
        return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
                              out->page_zeroing);
}

/*
 * Write a single reply to a request.  First the header is copied from
 * the write buffer.  The request is then searched on the processing
 * list by the unique ID found in the header.  If found, then remove
 * it from the list and copy the rest of the buffer to the request.
 * The request is finished by calling request_end().
 */
static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
                               unsigned long nr_segs, loff_t *off)
{
        int err;
        unsigned nbytes = iov_length(iov, nr_segs);
        struct fuse_req *req;
        struct fuse_out_header oh;
        struct fuse_copy_state cs;
        struct fuse_conn *fc = fuse_get_conn(file);
        if (!fc)
                return -ENODEV;

        fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
        if (nbytes < sizeof(struct fuse_out_header))
                return -EINVAL;

        err = fuse_copy_one(&cs, &oh, sizeof(oh));
        if (err)
                goto err_finish;
        err = -EINVAL;
        if (!oh.unique || oh.error <= -1000 || oh.error > 0 ||
            oh.len != nbytes)
                goto err_finish;

        spin_lock(&fuse_lock);
        req = request_find(fc, oh.unique);
        err = -EINVAL;
        if (!req)
                goto err_unlock;

        list_del_init(&req->list);
        if (req->interrupted) {
                request_end(fc, req);
                fuse_copy_finish(&cs);
                return -ENOENT;
        }
        req->out.h = oh;
        req->locked = 1;
        cs.req = req;
        spin_unlock(&fuse_lock);

        err = copy_out_args(&cs, &req->out, nbytes);
        fuse_copy_finish(&cs);

        spin_lock(&fuse_lock);
        req->locked = 0;
        if (!err) {
                if (req->interrupted)
                        err = -ENOENT;
        } else if (!req->interrupted)
                req->out.h.error = -EIO;
        request_end(fc, req);

        return err ? err : nbytes;

 err_unlock:
        spin_unlock(&fuse_lock);
 err_finish:
        fuse_copy_finish(&cs);
        return err;
}

static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
                              size_t nbytes, loff_t *off)
{
        struct iovec iov;
        iov.iov_len = nbytes;
        iov.iov_base = (char __user *) buf;
        return fuse_dev_writev(file, &iov, 1, off);
}

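/*
 * Editor's note, an illustrative sketch rather than part of this module:
 * the shape of a reply the daemon writes back.  fuse_dev_writev() above
 * only accepts it if 'unique' matches a request on the processing list,
 * 'error' is zero or a negated errno above -1000, and 'len' equals the
 * number of bytes written.  An error reply carries no payload, so it is
 * just the header.  'devfd' and 'in' are hypothetical.
 *
 *	struct fuse_out_header out;
 *	out.unique = in->unique;
 *	out.error = -ENOENT;
 *	out.len = sizeof(out);
 *	write(devfd, &out, sizeof(out));
 */
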
static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
{
        struct fuse_conn *fc = fuse_get_conn(file);
        unsigned mask = POLLOUT | POLLWRNORM;

        if (!fc)
                return -EPERM;

        poll_wait(file, &fc->waitq, wait);

        spin_lock(&fuse_lock);
        if (!list_empty(&fc->pending))
                mask |= POLLIN | POLLRDNORM;
        spin_unlock(&fuse_lock);

        return mask;
}

/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
        while (!list_empty(head)) {
                struct fuse_req *req;
                req = list_entry(head->next, struct fuse_req, list);
                list_del_init(&req->list);
                req->out.h.error = -ECONNABORTED;
                request_end(fc, req);
                spin_lock(&fuse_lock);
        }
}

static int fuse_dev_release(struct inode *inode, struct file *file)
{
        struct fuse_conn *fc;

        spin_lock(&fuse_lock);
        fc = file->private_data;
        if (fc) {
                fc->connected = 0;
                end_requests(fc, &fc->pending);
                end_requests(fc, &fc->processing);
                fuse_release_conn(fc);
        }
        spin_unlock(&fuse_lock);
        return 0;
}

struct file_operations fuse_dev_operations = {
        .owner = THIS_MODULE,
        .llseek = no_llseek,
        .read = fuse_dev_read,
        .readv = fuse_dev_readv,
        .write = fuse_dev_write,
        .writev = fuse_dev_writev,
        .poll = fuse_dev_poll,
        .release = fuse_dev_release,
};

static struct miscdevice fuse_miscdevice = {
        .minor = FUSE_MINOR,
        .name = "fuse",
        .fops = &fuse_dev_operations,
};

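/*
 * Editor's note, an illustrative sketch rather than part of this module:
 * misc_register() below creates the /dev/fuse character device.  A mount
 * helper typically opens it and passes the descriptor to the kernel
 * through the "fd=" mount option, after which reads and writes on that
 * descriptor go through fuse_dev_operations above.  The exact option
 * string is an assumption about the mount interface of this era, not
 * something defined in this file.
 *
 *	int devfd = open("/dev/fuse", O_RDWR);
 *	char opts[128];
 *	snprintf(opts, sizeof(opts), "fd=%d,rootmode=40000,user_id=%d,group_id=%d",
 *		 devfd, getuid(), getgid());
 *	mount("appfs", "/mnt/app", "fuse", 0, opts);
 */
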
int __init fuse_dev_init(void)
{
        int err = -ENOMEM;
        fuse_req_cachep = kmem_cache_create("fuse_request",
                                            sizeof(struct fuse_req),
                                            0, 0, NULL, NULL);
        if (!fuse_req_cachep)
                goto out;

        err = misc_register(&fuse_miscdevice);
        if (err)
                goto out_cache_clean;

        return 0;

 out_cache_clean:
        kmem_cache_destroy(fuse_req_cachep);
 out:
        return err;
}

void fuse_dev_cleanup(void)
{
        misc_deregister(&fuse_miscdevice);
        kmem_cache_destroy(fuse_req_cachep);
}