Merge with rsync://rsync.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
[linux-2.6] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by the above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge amount
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/signal.h>
89 #include <linux/sched.h>
90 #include <linux/errno.h>
91 #include <linux/string.h>
92 #include <linux/stat.h>
93 #include <linux/dcache.h>
94 #include <linux/namei.h>
95 #include <linux/socket.h>
96 #include <linux/un.h>
97 #include <linux/fcntl.h>
98 #include <linux/termios.h>
99 #include <linux/sockios.h>
100 #include <linux/net.h>
101 #include <linux/in.h>
102 #include <linux/fs.h>
103 #include <linux/slab.h>
104 #include <asm/uaccess.h>
105 #include <linux/skbuff.h>
106 #include <linux/netdevice.h>
107 #include <net/sock.h>
108 #include <net/tcp_states.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/smp_lock.h>
116 #include <linux/rtnetlink.h>
117 #include <linux/mount.h>
118 #include <net/checksum.h>
119 #include <linux/security.h>
120
/*
 * Value copied into sk_max_ack_backlog of every socket created by
 * unix_create1().
 */
int sysctl_unix_max_dgram_qlen = 10;

/*
 * Socket hash table. The extra bucket at index UNIX_HASH_SIZE is the list
 * of sockets that have not been bound yet (see unix_sockets_unbound).
 */
struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
DEFINE_SPINLOCK(unix_table_lock);
/* Incremented in unix_create1(), decremented in unix_sock_destructor(). */
static atomic_t unix_nr_socks = ATOMIC_INIT(0);

#define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])

/*
 * unix_bind() stores UNIX_HASH_SIZE in addr->hash for filesystem names, so
 * any other hash value marks an abstract name. The macro dereferences
 * unix_sk(sk)->addr without checking it for NULL.
 */
#define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
130
131 /*
132  *  SMP locking strategy:
133  *    hash table is protected with spinlock unix_table_lock
134  *    each socket state is protected by separate rwlock.
135  */
136
137 static inline unsigned unix_hash_fold(unsigned hash)
138 {
139         hash ^= hash>>16;
140         hash ^= hash>>8;
141         return hash&(UNIX_HASH_SIZE-1);
142 }
143
144 #define unix_peer(sk) (unix_sk(sk)->peer)
145
146 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
147 {
148         return unix_peer(osk) == sk;
149 }
150
151 static inline int unix_may_send(struct sock *sk, struct sock *osk)
152 {
153         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
154 }
155
156 static struct sock *unix_peer_get(struct sock *s)
157 {
158         struct sock *peer;
159
160         unix_state_rlock(s);
161         peer = unix_peer(s);
162         if (peer)
163                 sock_hold(peer);
164         unix_state_runlock(s);
165         return peer;
166 }
167
168 static inline void unix_release_addr(struct unix_address *addr)
169 {
170         if (atomic_dec_and_test(&addr->refcnt))
171                 kfree(addr);
172 }
173
174 /*
175  *      Check unix socket name:
176  *              - should be not zero length.
177  *              - if started by not zero, should be NULL terminated (FS object)
178  *              - if started by zero, it is abstract name.
179  */
180  
181 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
182 {
183         if (len <= sizeof(short) || len > sizeof(*sunaddr))
184                 return -EINVAL;
185         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
186                 return -EINVAL;
187         if (sunaddr->sun_path[0]) {
188                 /*
189                  * This may look like an off by one error but it is a bit more
190                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
191                  * sun_path[108] doesnt as such exist.  However in kernel space
192                  * we are guaranteed that it is a valid memory location in our
193                  * kernel address buffer.
194                  */
195                 ((char *)sunaddr)[len]=0;
196                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
197                 return len;
198         }
199
200         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
201         return len;
202 }
203
/*
 * Unhash @sk with sk_del_node_init(). Takes no lock itself; the callers in
 * this file invoke it with unix_table_lock held.
 */
static void __unix_remove_socket(struct sock *sk)
{
        sk_del_node_init(sk);
}
208
/*
 * Add @sk to the hash chain @list. Takes no lock itself; the callers in
 * this file invoke it with unix_table_lock held.
 */
static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        /* The socket must not be on any chain when it is inserted. */
        BUG_TRAP(sk_unhashed(sk));
        sk_add_node(sk, list);
}
214
/* Locked variant: unhash @sk while holding unix_table_lock. */
static inline void unix_remove_socket(struct sock *sk)
{
        spin_lock(&unix_table_lock);
        __unix_remove_socket(sk);
        spin_unlock(&unix_table_lock);
}
221
/* Locked variant: add @sk to chain @list while holding unix_table_lock. */
static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
{
        spin_lock(&unix_table_lock);
        __unix_insert_socket(list, sk);
        spin_unlock(&unix_table_lock);
}
228
229 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
230                                               int len, int type, unsigned hash)
231 {
232         struct sock *s;
233         struct hlist_node *node;
234
235         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
236                 struct unix_sock *u = unix_sk(s);
237
238                 if (u->addr->len == len &&
239                     !memcmp(u->addr->name, sunname, len))
240                         goto found;
241         }
242         s = NULL;
243 found:
244         return s;
245 }
246
247 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
248                                                    int len, int type,
249                                                    unsigned hash)
250 {
251         struct sock *s;
252
253         spin_lock(&unix_table_lock);
254         s = __unix_find_socket_byname(sunname, len, type, hash);
255         if (s)
256                 sock_hold(s);
257         spin_unlock(&unix_table_lock);
258         return s;
259 }
260
261 static struct sock *unix_find_socket_byinode(struct inode *i)
262 {
263         struct sock *s;
264         struct hlist_node *node;
265
266         spin_lock(&unix_table_lock);
267         sk_for_each(s, node,
268                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
269                 struct dentry *dentry = unix_sk(s)->dentry;
270
271                 if(dentry && dentry->d_inode == i)
272                 {
273                         sock_hold(s);
274                         goto found;
275                 }
276         }
277         s = NULL;
278 found:
279         spin_unlock(&unix_table_lock);
280         return s;
281 }
282
283 static inline int unix_writable(struct sock *sk)
284 {
285         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
286 }
287
288 static void unix_write_space(struct sock *sk)
289 {
290         read_lock(&sk->sk_callback_lock);
291         if (unix_writable(sk)) {
292                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
293                         wake_up_interruptible(sk->sk_sleep);
294                 sk_wake_async(sk, 2, POLL_OUT);
295         }
296         read_unlock(&sk->sk_callback_lock);
297 }
298
299 /* When dgram socket disconnects (or changes its peer), we clear its receive
300  * queue of packets arrived from previous peer. First, it allows to do
301  * flow control based only on wmem_alloc; second, sk connected to peer
302  * may receive messages only from that peer. */
303 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
304 {
305         if (!skb_queue_empty(&sk->sk_receive_queue)) {
306                 skb_queue_purge(&sk->sk_receive_queue);
307                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
308
309                 /* If one link of bidirectional dgram pipe is disconnected,
310                  * we signal error. Messages are lost. Do not make this,
311                  * when peer was not connected to us.
312                  */
313                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
314                         other->sk_err = ECONNRESET;
315                         other->sk_error_report(other);
316                 }
317         }
318 }
319
320 static void unix_sock_destructor(struct sock *sk)
321 {
322         struct unix_sock *u = unix_sk(sk);
323
324         skb_queue_purge(&sk->sk_receive_queue);
325
326         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
327         BUG_TRAP(sk_unhashed(sk));
328         BUG_TRAP(!sk->sk_socket);
329         if (!sock_flag(sk, SOCK_DEAD)) {
330                 printk("Attempt to release alive unix socket: %p\n", sk);
331                 return;
332         }
333
334         if (u->addr)
335                 unix_release_addr(u->addr);
336
337         atomic_dec(&unix_nr_socks);
338 #ifdef UNIX_REFCNT_DEBUG
339         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
340 #endif
341 }
342
/*
 * Tear down a unix socket: unhash it, orphan it and mark it closed, notify
 * a connected stream/seqpacket peer, drain the receive queue, drop the
 * filesystem references and finally drop a reference on @sk itself.
 *
 * @embrion is passed as 1 by the recursive call below, which releases the
 * sockets attached to buffers still queued on a listening socket; a
 * non-zero value makes the peer see ECONNRESET.
 *
 * Always returns 0.
 */
static int unix_release_sock (struct sock *sk, int embrion)
{
        struct unix_sock *u = unix_sk(sk);
        struct dentry *dentry;
        struct vfsmount *mnt;
        struct sock *skpair;
        struct sk_buff *skb;
        int state;

        unix_remove_socket(sk);

        /* Clear state */
        unix_state_wlock(sk);
        sock_orphan(sk);
        sk->sk_shutdown = SHUTDOWN_MASK;
        /* Detach dentry/mnt under the lock; they are put further down. */
        dentry       = u->dentry;
        u->dentry    = NULL;
        mnt          = u->mnt;
        u->mnt       = NULL;
        /* Remember the old state: TCP_LISTEN changes how the queue is drained. */
        state = sk->sk_state;
        sk->sk_state = TCP_CLOSE;
        unix_state_wunlock(sk);

        wake_up_interruptible_all(&u->peer_wait);

        skpair=unix_peer(sk);

        if (skpair!=NULL) {
                if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
                        unix_state_wlock(skpair);
                        /* No more writes */
                        skpair->sk_shutdown = SHUTDOWN_MASK;
                        /* Unread data or an embryo being dropped: report a reset. */
                        if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
                                skpair->sk_err = ECONNRESET;
                        unix_state_wunlock(skpair);
                        skpair->sk_state_change(skpair);
                        read_lock(&skpair->sk_callback_lock);
                        sk_wake_async(skpair,1,POLL_HUP);
                        read_unlock(&skpair->sk_callback_lock);
                }
                sock_put(skpair); /* It may now die */
                unix_peer(sk) = NULL;
        }

        /* Try to flush out this socket. Throw out buffers at least */

        while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
                /* On a listener each queued buffer carries a socket to release. */
                if (state==TCP_LISTEN)
                        unix_release_sock(skb->sk, 1);
                /* passed fds are erased in the kfree_skb hook        */
                kfree_skb(skb);
        }

        if (dentry) {
                dput(dentry);
                mntput(mnt);
        }

        sock_put(sk);

        /* ---- Socket is dead now and most probably destroyed ---- */

        /*
         * Fixme: BSD difference: In BSD all sockets connected to us get
         *        ECONNRESET and we die on the spot. In Linux we behave
         *        like files and pipes do and wait for the last
         *        dereference.
         *
         * Can't we simply set sock->err?
         *
         *        What the above comment does talk about? --ANK(980817)
         */

        if (atomic_read(&unix_tot_inflight))
                unix_gc();              /* Garbage collect fds */

        return 0;
}
421
422 static int unix_listen(struct socket *sock, int backlog)
423 {
424         int err;
425         struct sock *sk = sock->sk;
426         struct unix_sock *u = unix_sk(sk);
427
428         err = -EOPNOTSUPP;
429         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
430                 goto out;                       /* Only stream/seqpacket sockets accept */
431         err = -EINVAL;
432         if (!u->addr)
433                 goto out;                       /* No listens on an unbound socket */
434         unix_state_wlock(sk);
435         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
436                 goto out_unlock;
437         if (backlog > sk->sk_max_ack_backlog)
438                 wake_up_interruptible_all(&u->peer_wait);
439         sk->sk_max_ack_backlog  = backlog;
440         sk->sk_state            = TCP_LISTEN;
441         /* set credentials so connect can copy them */
442         sk->sk_peercred.pid     = current->tgid;
443         sk->sk_peercred.uid     = current->euid;
444         sk->sk_peercred.gid     = current->egid;
445         err = 0;
446
447 out_unlock:
448         unix_state_wunlock(sk);
449 out:
450         return err;
451 }
452
/*
 * Forward declarations of the handlers referenced by the proto_ops tables
 * below.
 */
static int unix_release(struct socket *);
static int unix_bind(struct socket *, struct sockaddr *, int);
static int unix_stream_connect(struct socket *, struct sockaddr *,
                               int addr_len, int flags);
static int unix_socketpair(struct socket *, struct socket *);
static int unix_accept(struct socket *, struct socket *, int);
static int unix_getname(struct socket *, struct sockaddr *, int *, int);
static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
static int unix_ioctl(struct socket *, unsigned int, unsigned long);
static int unix_shutdown(struct socket *, int);
static int unix_stream_sendmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t);
static int unix_stream_recvmsg(struct kiocb *, struct socket *,
                               struct msghdr *, size_t, int);
static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t);
static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
                              struct msghdr *, size_t, int);
static int unix_dgram_connect(struct socket *, struct sockaddr *,
                              int, int);
static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
                                  struct msghdr *, size_t);
475
476 static const struct proto_ops unix_stream_ops = {
477         .family =       PF_UNIX,
478         .owner =        THIS_MODULE,
479         .release =      unix_release,
480         .bind =         unix_bind,
481         .connect =      unix_stream_connect,
482         .socketpair =   unix_socketpair,
483         .accept =       unix_accept,
484         .getname =      unix_getname,
485         .poll =         unix_poll,
486         .ioctl =        unix_ioctl,
487         .listen =       unix_listen,
488         .shutdown =     unix_shutdown,
489         .setsockopt =   sock_no_setsockopt,
490         .getsockopt =   sock_no_getsockopt,
491         .sendmsg =      unix_stream_sendmsg,
492         .recvmsg =      unix_stream_recvmsg,
493         .mmap =         sock_no_mmap,
494         .sendpage =     sock_no_sendpage,
495 };
496
497 static const struct proto_ops unix_dgram_ops = {
498         .family =       PF_UNIX,
499         .owner =        THIS_MODULE,
500         .release =      unix_release,
501         .bind =         unix_bind,
502         .connect =      unix_dgram_connect,
503         .socketpair =   unix_socketpair,
504         .accept =       sock_no_accept,
505         .getname =      unix_getname,
506         .poll =         datagram_poll,
507         .ioctl =        unix_ioctl,
508         .listen =       sock_no_listen,
509         .shutdown =     unix_shutdown,
510         .setsockopt =   sock_no_setsockopt,
511         .getsockopt =   sock_no_getsockopt,
512         .sendmsg =      unix_dgram_sendmsg,
513         .recvmsg =      unix_dgram_recvmsg,
514         .mmap =         sock_no_mmap,
515         .sendpage =     sock_no_sendpage,
516 };
517
518 static const struct proto_ops unix_seqpacket_ops = {
519         .family =       PF_UNIX,
520         .owner =        THIS_MODULE,
521         .release =      unix_release,
522         .bind =         unix_bind,
523         .connect =      unix_stream_connect,
524         .socketpair =   unix_socketpair,
525         .accept =       unix_accept,
526         .getname =      unix_getname,
527         .poll =         datagram_poll,
528         .ioctl =        unix_ioctl,
529         .listen =       unix_listen,
530         .shutdown =     unix_shutdown,
531         .setsockopt =   sock_no_setsockopt,
532         .getsockopt =   sock_no_getsockopt,
533         .sendmsg =      unix_seqpacket_sendmsg,
534         .recvmsg =      unix_dgram_recvmsg,
535         .mmap =         sock_no_mmap,
536         .sendpage =     sock_no_sendpage,
537 };
538
539 static struct proto unix_proto = {
540         .name     = "UNIX",
541         .owner    = THIS_MODULE,
542         .obj_size = sizeof(struct unix_sock),
543 };
544
545 static struct sock * unix_create1(struct socket *sock)
546 {
547         struct sock *sk = NULL;
548         struct unix_sock *u;
549
550         if (atomic_read(&unix_nr_socks) >= 2*get_max_files())
551                 goto out;
552
553         sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
554         if (!sk)
555                 goto out;
556
557         atomic_inc(&unix_nr_socks);
558
559         sock_init_data(sock,sk);
560
561         sk->sk_write_space      = unix_write_space;
562         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
563         sk->sk_destruct         = unix_sock_destructor;
564         u         = unix_sk(sk);
565         u->dentry = NULL;
566         u->mnt    = NULL;
567         spin_lock_init(&u->lock);
568         atomic_set(&u->inflight, sock ? 0 : -1);
569         mutex_init(&u->readlock); /* single task reading lock */
570         init_waitqueue_head(&u->peer_wait);
571         unix_insert_socket(unix_sockets_unbound, sk);
572 out:
573         return sk;
574 }
575
576 static int unix_create(struct socket *sock, int protocol)
577 {
578         if (protocol && protocol != PF_UNIX)
579                 return -EPROTONOSUPPORT;
580
581         sock->state = SS_UNCONNECTED;
582
583         switch (sock->type) {
584         case SOCK_STREAM:
585                 sock->ops = &unix_stream_ops;
586                 break;
587                 /*
588                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
589                  *      nothing uses it.
590                  */
591         case SOCK_RAW:
592                 sock->type=SOCK_DGRAM;
593         case SOCK_DGRAM:
594                 sock->ops = &unix_dgram_ops;
595                 break;
596         case SOCK_SEQPACKET:
597                 sock->ops = &unix_seqpacket_ops;
598                 break;
599         default:
600                 return -ESOCKTNOSUPPORT;
601         }
602
603         return unix_create1(sock) ? 0 : -ENOMEM;
604 }
605
606 static int unix_release(struct socket *sock)
607 {
608         struct sock *sk = sock->sk;
609
610         if (!sk)
611                 return 0;
612
613         sock->sk = NULL;
614
615         return unix_release_sock (sk, 0);
616 }
617
618 static int unix_autobind(struct socket *sock)
619 {
620         struct sock *sk = sock->sk;
621         struct unix_sock *u = unix_sk(sk);
622         static u32 ordernum = 1;
623         struct unix_address * addr;
624         int err;
625
626         mutex_lock(&u->readlock);
627
628         err = 0;
629         if (u->addr)
630                 goto out;
631
632         err = -ENOMEM;
633         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
634         if (!addr)
635                 goto out;
636
637         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
638         addr->name->sun_family = AF_UNIX;
639         atomic_set(&addr->refcnt, 1);
640
641 retry:
642         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
643         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
644
645         spin_lock(&unix_table_lock);
646         ordernum = (ordernum+1)&0xFFFFF;
647
648         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
649                                       addr->hash)) {
650                 spin_unlock(&unix_table_lock);
651                 /* Sanity yield. It is unusual case, but yet... */
652                 if (!(ordernum&0xFF))
653                         yield();
654                 goto retry;
655         }
656         addr->hash ^= sk->sk_type;
657
658         __unix_remove_socket(sk);
659         u->addr = addr;
660         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
661         spin_unlock(&unix_table_lock);
662         err = 0;
663
664 out:    mutex_unlock(&u->readlock);
665         return err;
666 }
667
/*
 * Find the socket bound to @sunname and return it with a reference held.
 *
 * Filesystem names are resolved with path_lookup(); the caller needs write
 * permission on the object, which must be a socket inode with a unix
 * socket attached, and that socket must be of type @type. Abstract names
 * are looked up in the name hash.
 *
 * On failure NULL is returned and *error is set: the error of the path
 * lookup or permission check, -ECONNREFUSED when no socket is found, or
 * -EPROTOTYPE on a type mismatch. *error is not written on success.
 */
static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
                                    int type, unsigned hash, int *error)
{
        struct sock *other;
        struct nameidata nd;
        int err;

        if (!sunname->sun_path[0]) {
                struct dentry *dentry;

                /* Abstract name. */
                other = unix_find_socket_byname(sunname, len, type, hash);
                if (!other) {
                        err = -ECONNREFUSED;
                        goto fail;
                }

                dentry = unix_sk(other)->dentry;
                if (dentry)
                        touch_atime(unix_sk(other)->mnt, dentry);
                return other;
        }

        /* Filesystem name. */
        err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
        if (err)
                goto fail;

        err = vfs_permission(&nd, MAY_WRITE);
        if (err)
                goto put_fail;

        err = -ECONNREFUSED;
        if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
                goto put_fail;

        other = unix_find_socket_byinode(nd.dentry->d_inode);
        if (!other)
                goto put_fail;

        /* The access time is only updated when the types match. */
        if (other->sk_type == type)
                touch_atime(nd.mnt, nd.dentry);

        path_release(&nd);

        if (other->sk_type != type) {
                sock_put(other);
                err = -EPROTOTYPE;
                goto fail;
        }
        return other;

put_fail:
        path_release(&nd);
fail:
        *error = err;
        return NULL;
}
719
720
/*
 * bind() handler.
 *
 * An address consisting of the family field only (addr_len equal to
 * sizeof(short)) is handed to unix_autobind(). Otherwise the name is
 * validated with unix_mkname() and then:
 *   - a filesystem name gets a socket inode created with vfs_mknod() in
 *     the parent directory, and the socket is hashed by inode number;
 *   - an abstract name is checked for duplicates under unix_table_lock,
 *     and the socket is hashed by (name hash ^ socket type).
 *
 * Returns 0 on success, -EINVAL for a wrong family, an invalid name or an
 * already bound socket, -ENOMEM when the address cannot be allocated,
 * -EADDRINUSE when the abstract name exists or the filesystem path ends in
 * -EEXIST, or another error from the path lookup / node creation.
 */
static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sock *sk = sock->sk;
        struct unix_sock *u = unix_sk(sk);
        struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
        struct dentry * dentry = NULL;
        struct nameidata nd;
        int err;
        unsigned hash;
        struct unix_address *addr;
        struct hlist_head *list;

        err = -EINVAL;
        if (sunaddr->sun_family != AF_UNIX)
                goto out;

        /* Only the family was supplied: pick a name automatically. */
        if (addr_len==sizeof(short)) {
                err = unix_autobind(sock);
                goto out;
        }

        err = unix_mkname(sunaddr, addr_len, &hash);
        if (err < 0)
                goto out;
        /* unix_mkname() returns the effective length of the name. */
        addr_len = err;

        mutex_lock(&u->readlock);

        /* A socket can be bound only once. */
        err = -EINVAL;
        if (u->addr)
                goto out_up;

        err = -ENOMEM;
        addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
        if (!addr)
                goto out_up;

        memcpy(addr->name, sunaddr, addr_len);
        addr->len = addr_len;
        /*
         * unix_mkname() sets hash only for abstract names; for filesystem
         * names this value is replaced by UNIX_HASH_SIZE below.
         */
        addr->hash = hash ^ sk->sk_type;
        atomic_set(&addr->refcnt, 1);

        if (sunaddr->sun_path[0]) {
                unsigned int mode;
                err = 0;
                /*
                 * Get the parent directory, calculate the hash for last
                 * component.
                 */
                err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
                if (err)
                        goto out_mknod_parent;

                dentry = lookup_create(&nd, 0);
                err = PTR_ERR(dentry);
                if (IS_ERR(dentry))
                        goto out_mknod_unlock;

                /*
                 * All right, let's create it.
                 */
                mode = S_IFSOCK |
                       (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
                err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
                if (err)
                        goto out_mknod_dput;
                /*
                 * Done with the parent directory: unlock and drop it, and
                 * let nd.dentry refer to the new socket node instead.
                 */
                mutex_unlock(&nd.dentry->d_inode->i_mutex);
                dput(nd.dentry);
                nd.dentry = dentry;

                /* Marks the address as a filesystem name (see UNIX_ABSTRACT). */
                addr->hash = UNIX_HASH_SIZE;
        }

        spin_lock(&unix_table_lock);

        if (!sunaddr->sun_path[0]) {
                /* Abstract name: refuse a duplicate of the same type. */
                err = -EADDRINUSE;
                if (__unix_find_socket_byname(sunaddr, addr_len,
                                              sk->sk_type, hash)) {
                        unix_release_addr(addr);
                        goto out_unlock;
                }

                list = &unix_socket_table[addr->hash];
        } else {
                /* Filesystem name: hash by inode number, keep dentry and mnt. */
                list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
                u->dentry = nd.dentry;
                u->mnt    = nd.mnt;
        }

        /* Move the socket from its current chain to its final chain. */
        err = 0;
        __unix_remove_socket(sk);
        u->addr = addr;
        __unix_insert_socket(list, sk);

out_unlock:
        spin_unlock(&unix_table_lock);
out_up:
        mutex_unlock(&u->readlock);
out:
        return err;

        /* Unwinding for failures while creating the filesystem node. */
out_mknod_dput:
        dput(dentry);
out_mknod_unlock:
        mutex_unlock(&nd.dentry->d_inode->i_mutex);
        path_release(&nd);
out_mknod_parent:
        /* An existing path is reported as an address already in use. */
        if (err==-EEXIST)
                err=-EADDRINUSE;
        unix_release_addr(addr);
        goto out_up;
}
834
/*
 * connect() handler for datagram sockets.
 *
 * With an AF_UNSPEC address the current association is dissolved.
 * Otherwise the target is looked up with unix_find_other() and becomes the
 * new peer, provided unix_may_send() and the security hook allow it. When
 * an old peer is replaced by a different one, unix_dgram_disconnected()
 * is called for the old peer.
 *
 * Returns 0 on success, -EPERM when the target is connected to somebody
 * else, or the error from unix_mkname(), unix_autobind(),
 * unix_find_other() or security_unix_may_send().
 */
static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
                              int alen, int flags)
{
        struct sock *sk = sock->sk;
        struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
        struct sock *other;
        unsigned hash;
        int err;

        if (addr->sa_family != AF_UNSPEC) {
                err = unix_mkname(sunaddr, alen, &hash);
                if (err < 0)
                        goto out;
                alen = err;

                /* With SOCK_PASSCRED set, an unbound socket is autobound first. */
                if (test_bit(SOCK_PASSCRED, &sock->flags) &&
                    !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
                        goto out;

                /* On success a reference on other is held from here on. */
                other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
                if (!other)
                        goto out;

                unix_state_wlock(sk);

                err = -EPERM;
                if (!unix_may_send(sk, other))
                        goto out_unlock;

                err = security_unix_may_send(sk->sk_socket, other->sk_socket);
                if (err)
                        goto out_unlock;

        } else {
                /*
                 *      1003.1g breaking connected state with AF_UNSPEC
                 */
                other = NULL;
                unix_state_wlock(sk);
        }

        /*
         * If it was connected, reconnect.
         */
        if (unix_peer(sk)) {
                struct sock *old_peer = unix_peer(sk);
                unix_peer(sk)=other;
                unix_state_wunlock(sk);

                if (other != old_peer)
                        unix_dgram_disconnected(sk, old_peer);
                /* Drop the reference that was held on the previous peer. */
                sock_put(old_peer);
        } else {
                unix_peer(sk)=other;
                unix_state_wunlock(sk);
        }
        return 0;

out_unlock:
        unix_state_wunlock(sk);
        /* Drop the reference obtained from unix_find_other(). */
        sock_put(other);
out:
        return err;
}
899
900 static long unix_wait_for_peer(struct sock *other, long timeo)
901 {
902         struct unix_sock *u = unix_sk(other);
903         int sched;
904         DEFINE_WAIT(wait);
905
906         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
907
908         sched = !sock_flag(other, SOCK_DEAD) &&
909                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
910                 (skb_queue_len(&other->sk_receive_queue) >
911                  other->sk_max_ack_backlog);
912
913         unix_state_runlock(other);
914
915         if (sched)
916                 timeo = schedule_timeout(timeo);
917
918         finish_wait(&u->peer_wait, &wait);
919         return timeo;
920 }
921
/*
 * connect() for connection-oriented AF_UNIX sockets.
 *
 * A fresh sock (newsk) and an skb charged to it are allocated up front.  The
 * listener is then looked up and its state read-locked, our own state is
 * write-locked, sk and newsk are made peers of each other, and the skb is
 * appended to the listener's receive queue for unix_accept() to pick up.
 *
 * Returns 0 on success, otherwise a negative errno.
 */
static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			       int addr_len, int flags)
{
	struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	struct unix_sock *newu;
	struct unix_sock *otheru;
	struct sock *newsk = NULL;
	struct sock *other = NULL;
	struct sk_buff *skb = NULL;
	unsigned hash;
	int st;
	int err;
	long timeo;

	err = unix_mkname(sunaddr, addr_len, &hash);
	if (err < 0)
		goto out;
	addr_len = err;

	if (test_bit(SOCK_PASSCRED, &sock->flags) && !u->addr) {
		err = unix_autobind(sock);
		if (err != 0)
			goto out;
	}

	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	/*
	 * Allocate all resources first.  Doing it after the state is locked
	 * would force us to recheck everything again anyway.
	 */
	err = -ENOMEM;

	/* The sock that will represent the far end of the connection. */
	newsk = unix_create1(NULL);
	if (newsk == NULL)
		goto out;

	/* The skb that gets queued on the listening sock. */
	skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
	if (skb == NULL)
		goto out;

restart:
	/* Find the listening sock. */
	other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
	if (!other)
		goto out;

	/* Latch the state of the peer. */
	unix_state_rlock(other);

	/* Apparently VFS overslept the socket's death.  Retry. */
	if (sock_flag(other, SOCK_DEAD)) {
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = -ECONNREFUSED;
	if (other->sk_state != TCP_LISTEN)
		goto out_unlock;

	if (skb_queue_len(&other->sk_receive_queue) >
	    other->sk_max_ack_backlog) {
		err = -EAGAIN;
		if (!timeo)
			goto out_unlock;

		/* Releases other's state lock before returning. */
		timeo = unix_wait_for_peer(other, timeo);

		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			goto out;
		sock_put(other);
		goto restart;
	}

	/*
	 * Latch our own state.
	 *
	 * This is a tricky place: we need to take the write lock on sk and
	 * cannot drop the lock on the peer, so a deadlock is possible.  The
	 * connect-to-self case and simultaneous connect attempts are ruled
	 * out by checking the socket state: other is TCP_LISTEN, and if sk
	 * were TCP_LISTEN too we bail out before trying to take the lock.
	 *
	 * The state has to be rechecked once the socket is locked.
	 */
	st = sk->sk_state;
	if (st != TCP_CLOSE) {
		/* Already connected, or in some state we cannot connect from */
		err = (st == TCP_ESTABLISHED) ? -EISCONN : -EINVAL;
		goto out_unlock;
	}

	unix_state_wlock(sk);

	if (sk->sk_state != st) {
		unix_state_wunlock(sk);
		unix_state_runlock(other);
		sock_put(other);
		goto restart;
	}

	err = security_unix_stream_connect(sock, other->sk_socket, newsk);
	if (err) {
		unix_state_wunlock(sk);
		goto out_unlock;
	}

	/* The way is open!  Quickly set all the necessary fields... */

	sock_hold(sk);
	unix_peer(newsk) = sk;
	newsk->sk_state = TCP_ESTABLISHED;
	newsk->sk_type = sk->sk_type;
	newsk->sk_peercred.pid = current->tgid;
	newsk->sk_peercred.uid = current->euid;
	newsk->sk_peercred.gid = current->egid;
	newu = unix_sk(newsk);
	newsk->sk_sleep = &newu->peer_wait;
	otheru = unix_sk(other);

	/* Copy address information from the listening sock to the new one. */
	if (otheru->addr) {
		atomic_inc(&otheru->addr->refcnt);
		newu->addr = otheru->addr;
	}
	if (otheru->dentry) {
		newu->dentry = dget(otheru->dentry);
		newu->mnt = mntget(otheru->mnt);
	}

	/* Set credentials. */
	sk->sk_peercred = other->sk_peercred;

	sock->state = SS_CONNECTED;
	sk->sk_state = TCP_ESTABLISHED;
	sock_hold(newsk);

	smp_mb__after_atomic_inc();	/* sock_hold() does an atomic_inc() */
	unix_peer(sk) = newsk;

	unix_state_wunlock(sk);

	/* Hand the connection request over to the listening sock. */
	spin_lock(&other->sk_receive_queue.lock);
	__skb_queue_tail(&other->sk_receive_queue, skb);
	/*
	 * Undo the artificially decreased inflight count now that the
	 * embryo is installed on the listening socket.
	 */
	atomic_inc(&newu->inflight);
	spin_unlock(&other->sk_receive_queue.lock);
	unix_state_runlock(other);
	other->sk_data_ready(other, 0);
	sock_put(other);
	return 0;

out_unlock:
	if (other)
		unix_state_runlock(other);

out:
	if (skb)
		kfree_skb(skb);
	if (newsk)
		unix_release_sock(newsk, 0);
	if (other)
		sock_put(other);
	return err;
}
1100
1101 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1102 {
1103         struct sock *ska=socka->sk, *skb = sockb->sk;
1104
1105         /* Join our sockets back to back */
1106         sock_hold(ska);
1107         sock_hold(skb);
1108         unix_peer(ska)=skb;
1109         unix_peer(skb)=ska;
1110         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1111         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1112         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1113
1114         if (ska->sk_type != SOCK_DGRAM) {
1115                 ska->sk_state = TCP_ESTABLISHED;
1116                 skb->sk_state = TCP_ESTABLISHED;
1117                 socka->state  = SS_CONNECTED;
1118                 sockb->state  = SS_CONNECTED;
1119         }
1120         return 0;
1121 }
1122
1123 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1124 {
1125         struct sock *sk = sock->sk;
1126         struct sock *tsk;
1127         struct sk_buff *skb;
1128         int err;
1129
1130         err = -EOPNOTSUPP;
1131         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1132                 goto out;
1133
1134         err = -EINVAL;
1135         if (sk->sk_state != TCP_LISTEN)
1136                 goto out;
1137
1138         /* If socket state is TCP_LISTEN it cannot change (for now...),
1139          * so that no locks are necessary.
1140          */
1141
1142         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1143         if (!skb) {
1144                 /* This means receive shutdown. */
1145                 if (err == 0)
1146                         err = -EINVAL;
1147                 goto out;
1148         }
1149
1150         tsk = skb->sk;
1151         skb_free_datagram(sk, skb);
1152         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1153
1154         /* attach accepted sock to socket */
1155         unix_state_wlock(tsk);
1156         newsock->state = SS_CONNECTED;
1157         sock_graft(tsk, newsock);
1158         unix_state_wunlock(tsk);
1159         return 0;
1160
1161 out:
1162         return err;
1163 }
1164
1165
1166 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1167 {
1168         struct sock *sk = sock->sk;
1169         struct unix_sock *u;
1170         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1171         int err = 0;
1172
1173         if (peer) {
1174                 sk = unix_peer_get(sk);
1175
1176                 err = -ENOTCONN;
1177                 if (!sk)
1178                         goto out;
1179                 err = 0;
1180         } else {
1181                 sock_hold(sk);
1182         }
1183
1184         u = unix_sk(sk);
1185         unix_state_rlock(sk);
1186         if (!u->addr) {
1187                 sunaddr->sun_family = AF_UNIX;
1188                 sunaddr->sun_path[0] = 0;
1189                 *uaddr_len = sizeof(short);
1190         } else {
1191                 struct unix_address *addr = u->addr;
1192
1193                 *uaddr_len = addr->len;
1194                 memcpy(sunaddr, addr->name, *uaddr_len);
1195         }
1196         unix_state_runlock(sk);
1197         sock_put(sk);
1198 out:
1199         return err;
1200 }
1201
1202 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1203 {
1204         int i;
1205
1206         scm->fp = UNIXCB(skb).fp;
1207         skb->destructor = sock_wfree;
1208         UNIXCB(skb).fp = NULL;
1209
1210         for (i=scm->fp->count-1; i>=0; i--)
1211                 unix_notinflight(scm->fp->fp[i]);
1212 }
1213
1214 static void unix_destruct_fds(struct sk_buff *skb)
1215 {
1216         struct scm_cookie scm;
1217         memset(&scm, 0, sizeof(scm));
1218         unix_detach_fds(&scm, skb);
1219
1220         /* Alas, it calls VFS */
1221         /* So fscking what? fput() had been SMP-safe since the last Summer */
1222         scm_destroy(&scm);
1223         sock_wfree(skb);
1224 }
1225
1226 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1227 {
1228         int i;
1229         for (i=scm->fp->count-1; i>=0; i--)
1230                 unix_inflight(scm->fp->fp[i]);
1231         UNIXCB(skb).fp = scm->fp;
1232         skb->destructor = unix_destruct_fds;
1233         scm->fp = NULL;
1234 }
1235
1236 /*
1237  *      Send AF_UNIX data.
1238  */
1239
1240 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1241                               struct msghdr *msg, size_t len)
1242 {
1243         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1244         struct sock *sk = sock->sk;
1245         struct unix_sock *u = unix_sk(sk);
1246         struct sockaddr_un *sunaddr=msg->msg_name;
1247         struct sock *other = NULL;
1248         int namelen = 0; /* fake GCC */
1249         int err;
1250         unsigned hash;
1251         struct sk_buff *skb;
1252         long timeo;
1253         struct scm_cookie tmp_scm;
1254
1255         if (NULL == siocb->scm)
1256                 siocb->scm = &tmp_scm;
1257         err = scm_send(sock, msg, siocb->scm);
1258         if (err < 0)
1259                 return err;
1260
1261         err = -EOPNOTSUPP;
1262         if (msg->msg_flags&MSG_OOB)
1263                 goto out;
1264
1265         if (msg->msg_namelen) {
1266                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1267                 if (err < 0)
1268                         goto out;
1269                 namelen = err;
1270         } else {
1271                 sunaddr = NULL;
1272                 err = -ENOTCONN;
1273                 other = unix_peer_get(sk);
1274                 if (!other)
1275                         goto out;
1276         }
1277
1278         if (test_bit(SOCK_PASSCRED, &sock->flags)
1279                 && !u->addr && (err = unix_autobind(sock)) != 0)
1280                 goto out;
1281
1282         err = -EMSGSIZE;
1283         if (len > sk->sk_sndbuf - 32)
1284                 goto out;
1285
1286         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1287         if (skb==NULL)
1288                 goto out;
1289
1290         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1291         if (siocb->scm->fp)
1292                 unix_attach_fds(siocb->scm, skb);
1293
1294         skb->h.raw = skb->data;
1295         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1296         if (err)
1297                 goto out_free;
1298
1299         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1300
1301 restart:
1302         if (!other) {
1303                 err = -ECONNRESET;
1304                 if (sunaddr == NULL)
1305                         goto out_free;
1306
1307                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1308                                         hash, &err);
1309                 if (other==NULL)
1310                         goto out_free;
1311         }
1312
1313         unix_state_rlock(other);
1314         err = -EPERM;
1315         if (!unix_may_send(sk, other))
1316                 goto out_unlock;
1317
1318         if (sock_flag(other, SOCK_DEAD)) {
1319                 /*
1320                  *      Check with 1003.1g - what should
1321                  *      datagram error
1322                  */
1323                 unix_state_runlock(other);
1324                 sock_put(other);
1325
1326                 err = 0;
1327                 unix_state_wlock(sk);
1328                 if (unix_peer(sk) == other) {
1329                         unix_peer(sk)=NULL;
1330                         unix_state_wunlock(sk);
1331
1332                         unix_dgram_disconnected(sk, other);
1333                         sock_put(other);
1334                         err = -ECONNREFUSED;
1335                 } else {
1336                         unix_state_wunlock(sk);
1337                 }
1338
1339                 other = NULL;
1340                 if (err)
1341                         goto out_free;
1342                 goto restart;
1343         }
1344
1345         err = -EPIPE;
1346         if (other->sk_shutdown & RCV_SHUTDOWN)
1347                 goto out_unlock;
1348
1349         if (sk->sk_type != SOCK_SEQPACKET) {
1350                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1351                 if (err)
1352                         goto out_unlock;
1353         }
1354
1355         if (unix_peer(other) != sk &&
1356             (skb_queue_len(&other->sk_receive_queue) >
1357              other->sk_max_ack_backlog)) {
1358                 if (!timeo) {
1359                         err = -EAGAIN;
1360                         goto out_unlock;
1361                 }
1362
1363                 timeo = unix_wait_for_peer(other, timeo);
1364
1365                 err = sock_intr_errno(timeo);
1366                 if (signal_pending(current))
1367                         goto out_free;
1368
1369                 goto restart;
1370         }
1371
1372         skb_queue_tail(&other->sk_receive_queue, skb);
1373         unix_state_runlock(other);
1374         other->sk_data_ready(other, len);
1375         sock_put(other);
1376         scm_destroy(siocb->scm);
1377         return len;
1378
1379 out_unlock:
1380         unix_state_runlock(other);
1381 out_free:
1382         kfree_skb(skb);
1383 out:
1384         if (other)
1385                 sock_put(other);
1386         scm_destroy(siocb->scm);
1387         return err;
1388 }
1389
1390                 
1391 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1392                                struct msghdr *msg, size_t len)
1393 {
1394         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1395         struct sock *sk = sock->sk;
1396         struct sock *other = NULL;
1397         struct sockaddr_un *sunaddr=msg->msg_name;
1398         int err,size;
1399         struct sk_buff *skb;
1400         int sent=0;
1401         struct scm_cookie tmp_scm;
1402
1403         if (NULL == siocb->scm)
1404                 siocb->scm = &tmp_scm;
1405         err = scm_send(sock, msg, siocb->scm);
1406         if (err < 0)
1407                 return err;
1408
1409         err = -EOPNOTSUPP;
1410         if (msg->msg_flags&MSG_OOB)
1411                 goto out_err;
1412
1413         if (msg->msg_namelen) {
1414                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1415                 goto out_err;
1416         } else {
1417                 sunaddr = NULL;
1418                 err = -ENOTCONN;
1419                 other = unix_peer(sk);
1420                 if (!other)
1421                         goto out_err;
1422         }
1423
1424         if (sk->sk_shutdown & SEND_SHUTDOWN)
1425                 goto pipe_err;
1426
1427         while(sent < len)
1428         {
1429                 /*
1430                  *      Optimisation for the fact that under 0.01% of X
1431                  *      messages typically need breaking up.
1432                  */
1433
1434                 size = len-sent;
1435
1436                 /* Keep two messages in the pipe so it schedules better */
1437                 if (size > ((sk->sk_sndbuf >> 1) - 64))
1438                         size = (sk->sk_sndbuf >> 1) - 64;
1439
1440                 if (size > SKB_MAX_ALLOC)
1441                         size = SKB_MAX_ALLOC;
1442                         
1443                 /*
1444                  *      Grab a buffer
1445                  */
1446                  
1447                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1448
1449                 if (skb==NULL)
1450                         goto out_err;
1451
1452                 /*
1453                  *      If you pass two values to the sock_alloc_send_skb
1454                  *      it tries to grab the large buffer with GFP_NOFS
1455                  *      (which can fail easily), and if it fails grab the
1456                  *      fallback size buffer which is under a page and will
1457                  *      succeed. [Alan]
1458                  */
1459                 size = min_t(int, size, skb_tailroom(skb));
1460
1461                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
1462                 if (siocb->scm->fp)
1463                         unix_attach_fds(siocb->scm, skb);
1464
1465                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1466                         kfree_skb(skb);
1467                         goto out_err;
1468                 }
1469
1470                 unix_state_rlock(other);
1471
1472                 if (sock_flag(other, SOCK_DEAD) ||
1473                     (other->sk_shutdown & RCV_SHUTDOWN))
1474                         goto pipe_err_free;
1475
1476                 skb_queue_tail(&other->sk_receive_queue, skb);
1477                 unix_state_runlock(other);
1478                 other->sk_data_ready(other, size);
1479                 sent+=size;
1480         }
1481
1482         scm_destroy(siocb->scm);
1483         siocb->scm = NULL;
1484
1485         return sent;
1486
1487 pipe_err_free:
1488         unix_state_runlock(other);
1489         kfree_skb(skb);
1490 pipe_err:
1491         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1492                 send_sig(SIGPIPE,current,0);
1493         err = -EPIPE;
1494 out_err:
1495         scm_destroy(siocb->scm);
1496         siocb->scm = NULL;
1497         return sent ? : err;
1498 }
1499
1500 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1501                                   struct msghdr *msg, size_t len)
1502 {
1503         int err;
1504         struct sock *sk = sock->sk;
1505         
1506         err = sock_error(sk);
1507         if (err)
1508                 return err;
1509
1510         if (sk->sk_state != TCP_ESTABLISHED)
1511                 return -ENOTCONN;
1512
1513         if (msg->msg_namelen)
1514                 msg->msg_namelen = 0;
1515
1516         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1517 }
1518                                                                                             
1519 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1520 {
1521         struct unix_sock *u = unix_sk(sk);
1522
1523         msg->msg_namelen = 0;
1524         if (u->addr) {
1525                 msg->msg_namelen = u->addr->len;
1526                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1527         }
1528 }
1529
1530 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1531                               struct msghdr *msg, size_t size,
1532                               int flags)
1533 {
1534         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1535         struct scm_cookie tmp_scm;
1536         struct sock *sk = sock->sk;
1537         struct unix_sock *u = unix_sk(sk);
1538         int noblock = flags & MSG_DONTWAIT;
1539         struct sk_buff *skb;
1540         int err;
1541
1542         err = -EOPNOTSUPP;
1543         if (flags&MSG_OOB)
1544                 goto out;
1545
1546         msg->msg_namelen = 0;
1547
1548         mutex_lock(&u->readlock);
1549
1550         skb = skb_recv_datagram(sk, flags, noblock, &err);
1551         if (!skb)
1552                 goto out_unlock;
1553
1554         wake_up_interruptible(&u->peer_wait);
1555
1556         if (msg->msg_name)
1557                 unix_copy_addr(msg, skb->sk);
1558
1559         if (size > skb->len)
1560                 size = skb->len;
1561         else if (size < skb->len)
1562                 msg->msg_flags |= MSG_TRUNC;
1563
1564         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1565         if (err)
1566                 goto out_free;
1567
1568         if (!siocb->scm) {
1569                 siocb->scm = &tmp_scm;
1570                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1571         }
1572         siocb->scm->creds = *UNIXCREDS(skb);
1573
1574         if (!(flags & MSG_PEEK))
1575         {
1576                 if (UNIXCB(skb).fp)
1577                         unix_detach_fds(siocb->scm, skb);
1578         }
1579         else 
1580         {
1581                 /* It is questionable: on PEEK we could:
1582                    - do not return fds - good, but too simple 8)
1583                    - return fds, and do not return them on read (old strategy,
1584                      apparently wrong)
1585                    - clone fds (I chose it for now, it is the most universal
1586                      solution)
1587                 
1588                    POSIX 1003.1g does not actually define this clearly
1589                    at all. POSIX 1003.1g doesn't define a lot of things
1590                    clearly however!                  
1591                    
1592                 */
1593                 if (UNIXCB(skb).fp)
1594                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1595         }
1596         err = size;
1597
1598         scm_recv(sock, msg, siocb->scm, flags);
1599
1600 out_free:
1601         skb_free_datagram(sk,skb);
1602 out_unlock:
1603         mutex_unlock(&u->readlock);
1604 out:
1605         return err;
1606 }
1607
1608 /*
1609  *      Sleep until data has arrive. But check for races..
1610  */
1611  
1612 static long unix_stream_data_wait(struct sock * sk, long timeo)
1613 {
1614         DEFINE_WAIT(wait);
1615
1616         unix_state_rlock(sk);
1617
1618         for (;;) {
1619                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1620
1621                 if (!skb_queue_empty(&sk->sk_receive_queue) ||
1622                     sk->sk_err ||
1623                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1624                     signal_pending(current) ||
1625                     !timeo)
1626                         break;
1627
1628                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1629                 unix_state_runlock(sk);
1630                 timeo = schedule_timeout(timeo);
1631                 unix_state_rlock(sk);
1632                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1633         }
1634
1635         finish_wait(sk->sk_sleep, &wait);
1636         unix_state_runlock(sk);
1637         return timeo;
1638 }
1639
1640
1641
1642 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1643                                struct msghdr *msg, size_t size,
1644                                int flags)
1645 {
1646         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1647         struct scm_cookie tmp_scm;
1648         struct sock *sk = sock->sk;
1649         struct unix_sock *u = unix_sk(sk);
1650         struct sockaddr_un *sunaddr=msg->msg_name;
1651         int copied = 0;
1652         int check_creds = 0;
1653         int target;
1654         int err = 0;
1655         long timeo;
1656
1657         err = -EINVAL;
1658         if (sk->sk_state != TCP_ESTABLISHED)
1659                 goto out;
1660
1661         err = -EOPNOTSUPP;
1662         if (flags&MSG_OOB)
1663                 goto out;
1664
1665         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1666         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1667
1668         msg->msg_namelen = 0;
1669
1670         /* Lock the socket to prevent queue disordering
1671          * while sleeps in memcpy_tomsg
1672          */
1673
1674         if (!siocb->scm) {
1675                 siocb->scm = &tmp_scm;
1676                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1677         }
1678
1679         mutex_lock(&u->readlock);
1680
1681         do
1682         {
1683                 int chunk;
1684                 struct sk_buff *skb;
1685
1686                 skb = skb_dequeue(&sk->sk_receive_queue);
1687                 if (skb==NULL)
1688                 {
1689                         if (copied >= target)
1690                                 break;
1691
1692                         /*
1693                          *      POSIX 1003.1g mandates this order.
1694                          */
1695                          
1696                         if ((err = sock_error(sk)) != 0)
1697                                 break;
1698                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1699                                 break;
1700                         err = -EAGAIN;
1701                         if (!timeo)
1702                                 break;
1703                         mutex_unlock(&u->readlock);
1704
1705                         timeo = unix_stream_data_wait(sk, timeo);
1706
1707                         if (signal_pending(current)) {
1708                                 err = sock_intr_errno(timeo);
1709                                 goto out;
1710                         }
1711                         mutex_lock(&u->readlock);
1712                         continue;
1713                 }
1714
1715                 if (check_creds) {
1716                         /* Never glue messages from different writers */
1717                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1718                                 skb_queue_head(&sk->sk_receive_queue, skb);
1719                                 break;
1720                         }
1721                 } else {
1722                         /* Copy credentials */
1723                         siocb->scm->creds = *UNIXCREDS(skb);
1724                         check_creds = 1;
1725                 }
1726
1727                 /* Copy address just once */
1728                 if (sunaddr)
1729                 {
1730                         unix_copy_addr(msg, skb->sk);
1731                         sunaddr = NULL;
1732                 }
1733
1734                 chunk = min_t(unsigned int, skb->len, size);
1735                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
1736                         skb_queue_head(&sk->sk_receive_queue, skb);
1737                         if (copied == 0)
1738                                 copied = -EFAULT;
1739                         break;
1740                 }
1741                 copied += chunk;
1742                 size -= chunk;
1743
1744                 /* Mark read part of skb as used */
1745                 if (!(flags & MSG_PEEK))
1746                 {
1747                         skb_pull(skb, chunk);
1748
1749                         if (UNIXCB(skb).fp)
1750                                 unix_detach_fds(siocb->scm, skb);
1751
1752                         /* put the skb back if we didn't use it up.. */
1753                         if (skb->len)
1754                         {
1755                                 skb_queue_head(&sk->sk_receive_queue, skb);
1756                                 break;
1757                         }
1758
1759                         kfree_skb(skb);
1760
1761                         if (siocb->scm->fp)
1762                                 break;
1763                 }
1764                 else
1765                 {
1766                         /* It is questionable, see note in unix_dgram_recvmsg.
1767                          */
1768                         if (UNIXCB(skb).fp)
1769                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1770
1771                         /* put message back and return */
1772                         skb_queue_head(&sk->sk_receive_queue, skb);
1773                         break;
1774                 }
1775         } while (size);
1776
1777         mutex_unlock(&u->readlock);
1778         scm_recv(sock, msg, siocb->scm, flags);
1779 out:
1780         return copied ? : err;
1781 }
1782
1783 static int unix_shutdown(struct socket *sock, int mode)
1784 {
1785         struct sock *sk = sock->sk;
1786         struct sock *other;
1787
1788         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1789
1790         if (mode) {
1791                 unix_state_wlock(sk);
1792                 sk->sk_shutdown |= mode;
1793                 other=unix_peer(sk);
1794                 if (other)
1795                         sock_hold(other);
1796                 unix_state_wunlock(sk);
1797                 sk->sk_state_change(sk);
1798
1799                 if (other &&
1800                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1801
1802                         int peer_mode = 0;
1803
1804                         if (mode&RCV_SHUTDOWN)
1805                                 peer_mode |= SEND_SHUTDOWN;
1806                         if (mode&SEND_SHUTDOWN)
1807                                 peer_mode |= RCV_SHUTDOWN;
1808                         unix_state_wlock(other);
1809                         other->sk_shutdown |= peer_mode;
1810                         unix_state_wunlock(other);
1811                         other->sk_state_change(other);
1812                         read_lock(&other->sk_callback_lock);
1813                         if (peer_mode == SHUTDOWN_MASK)
1814                                 sk_wake_async(other,1,POLL_HUP);
1815                         else if (peer_mode & RCV_SHUTDOWN)
1816                                 sk_wake_async(other,1,POLL_IN);
1817                         read_unlock(&other->sk_callback_lock);
1818                 }
1819                 if (other)
1820                         sock_put(other);
1821         }
1822         return 0;
1823 }
1824
1825 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1826 {
1827         struct sock *sk = sock->sk;
1828         long amount=0;
1829         int err;
1830
1831         switch(cmd)
1832         {
1833                 case SIOCOUTQ:
1834                         amount = atomic_read(&sk->sk_wmem_alloc);
1835                         err = put_user(amount, (int __user *)arg);
1836                         break;
1837                 case SIOCINQ:
1838                 {
1839                         struct sk_buff *skb;
1840
1841                         if (sk->sk_state == TCP_LISTEN) {
1842                                 err = -EINVAL;
1843                                 break;
1844                         }
1845
1846                         spin_lock(&sk->sk_receive_queue.lock);
1847                         if (sk->sk_type == SOCK_STREAM ||
1848                             sk->sk_type == SOCK_SEQPACKET) {
1849                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1850                                         amount += skb->len;
1851                         } else {
1852                                 skb = skb_peek(&sk->sk_receive_queue);
1853                                 if (skb)
1854                                         amount=skb->len;
1855                         }
1856                         spin_unlock(&sk->sk_receive_queue.lock);
1857                         err = put_user(amount, (int __user *)arg);
1858                         break;
1859                 }
1860
1861                 default:
1862                         err = -ENOIOCTLCMD;
1863                         break;
1864         }
1865         return err;
1866 }
1867
1868 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1869 {
1870         struct sock *sk = sock->sk;
1871         unsigned int mask;
1872
1873         poll_wait(file, sk->sk_sleep, wait);
1874         mask = 0;
1875
1876         /* exceptional events? */
1877         if (sk->sk_err)
1878                 mask |= POLLERR;
1879         if (sk->sk_shutdown == SHUTDOWN_MASK)
1880                 mask |= POLLHUP;
1881
1882         /* readable? */
1883         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1884             (sk->sk_shutdown & RCV_SHUTDOWN))
1885                 mask |= POLLIN | POLLRDNORM;
1886
1887         /* Connection-based need to check for termination and startup */
1888         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1889                 mask |= POLLHUP;
1890
1891         /*
1892          * we set writable also when the other side has shut down the
1893          * connection. This prevents stuck sockets.
1894          */
1895         if (unix_writable(sk))
1896                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1897
1898         return mask;
1899 }
1900
1901
1902 #ifdef CONFIG_PROC_FS
1903 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1904 {
1905         loff_t off = 0;
1906         struct sock *s;
1907
1908         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1909                 if (off == pos) 
1910                         return s;
1911                 ++off;
1912         }
1913         return NULL;
1914 }
1915
1916
1917 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1918 {
1919         spin_lock(&unix_table_lock);
1920         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1921 }
1922
1923 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1924 {
1925         ++*pos;
1926
1927         if (v == (void *)1) 
1928                 return first_unix_socket(seq->private);
1929         return next_unix_socket(seq->private, v);
1930 }
1931
1932 static void unix_seq_stop(struct seq_file *seq, void *v)
1933 {
1934         spin_unlock(&unix_table_lock);
1935 }
1936
1937 static int unix_seq_show(struct seq_file *seq, void *v)
1938 {
1939         
1940         if (v == (void *)1)
1941                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1942                          "Inode Path\n");
1943         else {
1944                 struct sock *s = v;
1945                 struct unix_sock *u = unix_sk(s);
1946                 unix_state_rlock(s);
1947
1948                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1949                         s,
1950                         atomic_read(&s->sk_refcnt),
1951                         0,
1952                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1953                         s->sk_type,
1954                         s->sk_socket ?
1955                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1956                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1957                         sock_i_ino(s));
1958
1959                 if (u->addr) {
1960                         int i, len;
1961                         seq_putc(seq, ' ');
1962
1963                         i = 0;
1964                         len = u->addr->len - sizeof(short);
1965                         if (!UNIX_ABSTRACT(s))
1966                                 len--;
1967                         else {
1968                                 seq_putc(seq, '@');
1969                                 i++;
1970                         }
1971                         for ( ; i < len; i++)
1972                                 seq_putc(seq, u->addr->name->sun_path[i]);
1973                 }
1974                 unix_state_runlock(s);
1975                 seq_putc(seq, '\n');
1976         }
1977
1978         return 0;
1979 }
1980
1981 static struct seq_operations unix_seq_ops = {
1982         .start  = unix_seq_start,
1983         .next   = unix_seq_next,
1984         .stop   = unix_seq_stop,
1985         .show   = unix_seq_show,
1986 };
1987
1988
1989 static int unix_seq_open(struct inode *inode, struct file *file)
1990 {
1991         struct seq_file *seq;
1992         int rc = -ENOMEM;
1993         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
1994
1995         if (!iter)
1996                 goto out;
1997
1998         rc = seq_open(file, &unix_seq_ops);
1999         if (rc)
2000                 goto out_kfree;
2001
2002         seq          = file->private_data;
2003         seq->private = iter;
2004         *iter = 0;
2005 out:
2006         return rc;
2007 out_kfree:
2008         kfree(iter);
2009         goto out;
2010 }
2011
2012 static struct file_operations unix_seq_fops = {
2013         .owner          = THIS_MODULE,
2014         .open           = unix_seq_open,
2015         .read           = seq_read,
2016         .llseek         = seq_lseek,
2017         .release        = seq_release_private,
2018 };
2019
2020 #endif
2021
2022 static struct net_proto_family unix_family_ops = {
2023         .family = PF_UNIX,
2024         .create = unix_create,
2025         .owner  = THIS_MODULE,
2026 };
2027
2028 static int __init af_unix_init(void)
2029 {
2030         int rc = -1;
2031         struct sk_buff *dummy_skb;
2032
2033         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2034                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2035                 goto out;
2036         }
2037
2038         rc = proto_register(&unix_proto, 1);
2039         if (rc != 0) {
2040                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2041                        __FUNCTION__);
2042                 goto out;
2043         }
2044
2045         sock_register(&unix_family_ops);
2046 #ifdef CONFIG_PROC_FS
2047         proc_net_fops_create("unix", 0, &unix_seq_fops);
2048 #endif
2049         unix_sysctl_register();
2050 out:
2051         return rc;
2052 }
2053
2054 static void __exit af_unix_exit(void)
2055 {
2056         sock_unregister(PF_UNIX);
2057         unix_sysctl_unregister();
2058         proc_net_remove("unix");
2059         proto_unregister(&unix_proto);
2060 }
2061
2062 module_init(af_unix_init);
2063 module_exit(af_unix_exit);
2064
2065 MODULE_LICENSE("GPL");
2066 MODULE_ALIAS_NETPROTO(PF_UNIX);