Merge with master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6.git
[linux-2.6] / net / unix / af_unix.c
1 /*
2  * NET4:        Implementation of BSD Unix domain sockets.
3  *
4  * Authors:     Alan Cox, <alan.cox@linux.org>
5  *
6  *              This program is free software; you can redistribute it and/or
7  *              modify it under the terms of the GNU General Public License
8  *              as published by the Free Software Foundation; either version
9  *              2 of the License, or (at your option) any later version.
10  *
11  * Version:     $Id: af_unix.c,v 1.133 2002/02/08 03:57:19 davem Exp $
12  *
13  * Fixes:
14  *              Linus Torvalds  :       Assorted bug cures.
15  *              Niibe Yutaka    :       async I/O support.
16  *              Carsten Paeth   :       PF_UNIX check, address fixes.
17  *              Alan Cox        :       Limit size of allocated blocks.
18  *              Alan Cox        :       Fixed the stupid socketpair bug.
19  *              Alan Cox        :       BSD compatibility fine tuning.
20  *              Alan Cox        :       Fixed a bug in connect when interrupted.
21  *              Alan Cox        :       Sorted out a proper draft version of
22  *                                      file descriptor passing hacked up from
23  *                                      Mike Shaver's work.
24  *              Marty Leisner   :       Fixes to fd passing
25  *              Nick Nevin      :       recvmsg bugfix.
26  *              Alan Cox        :       Started proper garbage collector
27  *              Heiko EiBfeldt  :       Missing verify_area check
28  *              Alan Cox        :       Started POSIXisms
29  *              Andreas Schwab  :       Replace inode by dentry for proper
30  *                                      reference counting
31  *              Kirk Petersen   :       Made this a module
32  *          Christoph Rohland   :       Elegant non-blocking accept/connect algorithm.
33  *                                      Lots of bug fixes.
34  *           Alexey Kuznetsov   :       Repaired (I hope) bugs introduced
35  *                                      by above two patches.
36  *           Andrea Arcangeli   :       If possible we block in connect(2)
37  *                                      if the max backlog of the listen socket
38  *                                      has been reached. This won't break
39  *                                      old apps and it will avoid a huge number
40  *                                      of socks hashed (this is for unix_gc()
41  *                                      performance reasons).
42  *                                      Security fix that limits the max
43  *                                      number of socks to 2*max_files and
44  *                                      the number of skb queueable in the
45  *                                      dgram receiver.
46  *              Artur Skawina   :       Hash function optimizations
47  *           Alexey Kuznetsov   :       Full scale SMP. Lot of bugs are introduced 8)
48  *            Malcolm Beattie   :       Set peercred for socketpair
49  *           Michal Ostrowski   :       Module initialization cleanup.
50  *           Arnaldo C. Melo    :       Remove MOD_{INC,DEC}_USE_COUNT,
51  *                                      the core infrastructure is doing that
52  *                                      for all net proto families now (2.5.69+)
53  *
54  *
55  * Known differences from reference BSD that was tested:
56  *
57  *      [TO FIX]
58  *      ECONNREFUSED is not returned from one end of a connected() socket to the
59  *              other the moment one end closes.
60  *      fstat() doesn't return st_dev=0, and give the blksize as high water mark
61  *              and a fake inode identifier (nor the BSD first socket fstat twice bug).
62  *      [NOT TO FIX]
63  *      accept() returns a path name even if the connecting socket has closed
64  *              in the meantime (BSD loses the path and gives up).
65  *      accept() returns 0 length path for an unbound connector. BSD returns 16
66  *              and a null first byte in the path (but not for gethost/peername - BSD bug ??)
67  *      socketpair(...SOCK_RAW..) doesn't panic the kernel.
68  *      BSD af_unix apparently has connect forgetting to block properly.
69  *              (need to check this with the POSIX spec in detail)
70  *
71  * Differences from 2.0.0-11-... (ANK)
72  *      Bug fixes and improvements.
73  *              - client shutdown killed server socket.
74  *              - removed all useless cli/sti pairs.
75  *
76  *      Semantic changes/extensions.
77  *              - generic control message passing.
78  *              - SCM_CREDENTIALS control message.
79  *              - "Abstract" (not FS based) socket bindings.
80  *                Abstract names are sequences of bytes (not zero terminated)
81  *                started by 0, so that this name space does not intersect
82  *                with BSD names.
83  */
84
85 #include <linux/module.h>
86 #include <linux/config.h>
87 #include <linux/kernel.h>
88 #include <linux/signal.h>
89 #include <linux/sched.h>
90 #include <linux/errno.h>
91 #include <linux/string.h>
92 #include <linux/stat.h>
93 #include <linux/dcache.h>
94 #include <linux/namei.h>
95 #include <linux/socket.h>
96 #include <linux/un.h>
97 #include <linux/fcntl.h>
98 #include <linux/termios.h>
99 #include <linux/sockios.h>
100 #include <linux/net.h>
101 #include <linux/in.h>
102 #include <linux/fs.h>
103 #include <linux/slab.h>
104 #include <asm/uaccess.h>
105 #include <linux/skbuff.h>
106 #include <linux/netdevice.h>
107 #include <net/sock.h>
108 #include <linux/tcp.h>
109 #include <net/af_unix.h>
110 #include <linux/proc_fs.h>
111 #include <linux/seq_file.h>
112 #include <net/scm.h>
113 #include <linux/init.h>
114 #include <linux/poll.h>
115 #include <linux/smp_lock.h>
116 #include <linux/rtnetlink.h>
117 #include <linux/mount.h>
118 #include <net/checksum.h>
119 #include <linux/security.h>
120
121 int sysctl_unix_max_dgram_qlen = 10;
122
123 struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
124 DEFINE_RWLOCK(unix_table_lock);
125 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
126
127 #define unix_sockets_unbound    (&unix_socket_table[UNIX_HASH_SIZE])
128
129 #define UNIX_ABSTRACT(sk)       (unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
130
131 /*
132  *  SMP locking strategy:
133  *    hash table is protected with rwlock unix_table_lock
134  *    each socket state is protected by separate rwlock.
135  */
136
137 static inline unsigned unix_hash_fold(unsigned hash)
138 {
139         hash ^= hash>>16;
140         hash ^= hash>>8;
141         return hash&(UNIX_HASH_SIZE-1);
142 }
143
144 #define unix_peer(sk) (unix_sk(sk)->peer)
145
146 static inline int unix_our_peer(struct sock *sk, struct sock *osk)
147 {
148         return unix_peer(osk) == sk;
149 }
150
151 static inline int unix_may_send(struct sock *sk, struct sock *osk)
152 {
153         return (unix_peer(osk) == NULL || unix_our_peer(sk, osk));
154 }
155
156 static struct sock *unix_peer_get(struct sock *s)
157 {
158         struct sock *peer;
159
160         unix_state_rlock(s);
161         peer = unix_peer(s);
162         if (peer)
163                 sock_hold(peer);
164         unix_state_runlock(s);
165         return peer;
166 }
167
168 static inline void unix_release_addr(struct unix_address *addr)
169 {
170         if (atomic_dec_and_test(&addr->refcnt))
171                 kfree(addr);
172 }
173
174 /*
175  *      Check unix socket name:
176  *              - should be not zero length.
177  *              - if started by not zero, should be NULL terminated (FS object)
178  *              - if started by zero, it is abstract name.
179  */
180  
181 static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp)
182 {
183         if (len <= sizeof(short) || len > sizeof(*sunaddr))
184                 return -EINVAL;
185         if (!sunaddr || sunaddr->sun_family != AF_UNIX)
186                 return -EINVAL;
187         if (sunaddr->sun_path[0]) {
188                 /*
189                  * This may look like an off by one error but it is a bit more
190                  * subtle. 108 is the longest valid AF_UNIX path for a binding.
191                  * sun_path[108] doesnt as such exist.  However in kernel space
192                  * we are guaranteed that it is a valid memory location in our
193                  * kernel address buffer.
194                  */
195                 ((char *)sunaddr)[len]=0;
196                 len = strlen(sunaddr->sun_path)+1+sizeof(short);
197                 return len;
198         }
199
200         *hashp = unix_hash_fold(csum_partial((char*)sunaddr, len, 0));
201         return len;
202 }
203
204 static void __unix_remove_socket(struct sock *sk)
205 {
206         sk_del_node_init(sk);
207 }
208
209 static void __unix_insert_socket(struct hlist_head *list, struct sock *sk)
210 {
211         BUG_TRAP(sk_unhashed(sk));
212         sk_add_node(sk, list);
213 }
214
215 static inline void unix_remove_socket(struct sock *sk)
216 {
217         write_lock(&unix_table_lock);
218         __unix_remove_socket(sk);
219         write_unlock(&unix_table_lock);
220 }
221
222 static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk)
223 {
224         write_lock(&unix_table_lock);
225         __unix_insert_socket(list, sk);
226         write_unlock(&unix_table_lock);
227 }
228
229 static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname,
230                                               int len, int type, unsigned hash)
231 {
232         struct sock *s;
233         struct hlist_node *node;
234
235         sk_for_each(s, node, &unix_socket_table[hash ^ type]) {
236                 struct unix_sock *u = unix_sk(s);
237
238                 if (u->addr->len == len &&
239                     !memcmp(u->addr->name, sunname, len))
240                         goto found;
241         }
242         s = NULL;
243 found:
244         return s;
245 }
246
247 static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname,
248                                                    int len, int type,
249                                                    unsigned hash)
250 {
251         struct sock *s;
252
253         read_lock(&unix_table_lock);
254         s = __unix_find_socket_byname(sunname, len, type, hash);
255         if (s)
256                 sock_hold(s);
257         read_unlock(&unix_table_lock);
258         return s;
259 }
260
261 static struct sock *unix_find_socket_byinode(struct inode *i)
262 {
263         struct sock *s;
264         struct hlist_node *node;
265
266         read_lock(&unix_table_lock);
267         sk_for_each(s, node,
268                     &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) {
269                 struct dentry *dentry = unix_sk(s)->dentry;
270
271                 if(dentry && dentry->d_inode == i)
272                 {
273                         sock_hold(s);
274                         goto found;
275                 }
276         }
277         s = NULL;
278 found:
279         read_unlock(&unix_table_lock);
280         return s;
281 }
282
283 static inline int unix_writable(struct sock *sk)
284 {
285         return (atomic_read(&sk->sk_wmem_alloc) << 2) <= sk->sk_sndbuf;
286 }
287
288 static void unix_write_space(struct sock *sk)
289 {
290         read_lock(&sk->sk_callback_lock);
291         if (unix_writable(sk)) {
292                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
293                         wake_up_interruptible(sk->sk_sleep);
294                 sk_wake_async(sk, 2, POLL_OUT);
295         }
296         read_unlock(&sk->sk_callback_lock);
297 }
298
299 /* When dgram socket disconnects (or changes its peer), we clear its receive
300  * queue of packets arrived from previous peer. First, it allows to do
301  * flow control based only on wmem_alloc; second, sk connected to peer
302  * may receive messages only from that peer. */
303 static void unix_dgram_disconnected(struct sock *sk, struct sock *other)
304 {
305         if (skb_queue_len(&sk->sk_receive_queue)) {
306                 skb_queue_purge(&sk->sk_receive_queue);
307                 wake_up_interruptible_all(&unix_sk(sk)->peer_wait);
308
309                 /* If one link of bidirectional dgram pipe is disconnected,
310                  * we signal error. Messages are lost. Do not make this,
311                  * when peer was not connected to us.
312                  */
313                 if (!sock_flag(other, SOCK_DEAD) && unix_peer(other) == sk) {
314                         other->sk_err = ECONNRESET;
315                         other->sk_error_report(other);
316                 }
317         }
318 }
319
320 static void unix_sock_destructor(struct sock *sk)
321 {
322         struct unix_sock *u = unix_sk(sk);
323
324         skb_queue_purge(&sk->sk_receive_queue);
325
326         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
327         BUG_TRAP(sk_unhashed(sk));
328         BUG_TRAP(!sk->sk_socket);
329         if (!sock_flag(sk, SOCK_DEAD)) {
330                 printk("Attempt to release alive unix socket: %p\n", sk);
331                 return;
332         }
333
334         if (u->addr)
335                 unix_release_addr(u->addr);
336
337         atomic_dec(&unix_nr_socks);
338 #ifdef UNIX_REFCNT_DEBUG
339         printk(KERN_DEBUG "UNIX %p is destroyed, %d are still alive.\n", sk, atomic_read(&unix_nr_socks));
340 #endif
341 }
342
343 static int unix_release_sock (struct sock *sk, int embrion)
344 {
345         struct unix_sock *u = unix_sk(sk);
346         struct dentry *dentry;
347         struct vfsmount *mnt;
348         struct sock *skpair;
349         struct sk_buff *skb;
350         int state;
351
352         unix_remove_socket(sk);
353
354         /* Clear state */
355         unix_state_wlock(sk);
356         sock_orphan(sk);
357         sk->sk_shutdown = SHUTDOWN_MASK;
358         dentry       = u->dentry;
359         u->dentry    = NULL;
360         mnt          = u->mnt;
361         u->mnt       = NULL;
362         state = sk->sk_state;
363         sk->sk_state = TCP_CLOSE;
364         unix_state_wunlock(sk);
365
366         wake_up_interruptible_all(&u->peer_wait);
367
368         skpair=unix_peer(sk);
369
370         if (skpair!=NULL) {
371                 if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) {
372                         unix_state_wlock(skpair);
373                         /* No more writes */
374                         skpair->sk_shutdown = SHUTDOWN_MASK;
375                         if (!skb_queue_empty(&sk->sk_receive_queue) || embrion)
376                                 skpair->sk_err = ECONNRESET;
377                         unix_state_wunlock(skpair);
378                         skpair->sk_state_change(skpair);
379                         read_lock(&skpair->sk_callback_lock);
380                         sk_wake_async(skpair,1,POLL_HUP);
381                         read_unlock(&skpair->sk_callback_lock);
382                 }
383                 sock_put(skpair); /* It may now die */
384                 unix_peer(sk) = NULL;
385         }
386
387         /* Try to flush out this socket. Throw out buffers at least */
388
389         while ((skb = skb_dequeue(&sk->sk_receive_queue)) != NULL) {
390                 if (state==TCP_LISTEN)
391                         unix_release_sock(skb->sk, 1);
392                 /* passed fds are erased in the kfree_skb hook        */
393                 kfree_skb(skb);
394         }
395
396         if (dentry) {
397                 dput(dentry);
398                 mntput(mnt);
399         }
400
401         sock_put(sk);
402
403         /* ---- Socket is dead now and most probably destroyed ---- */
404
405         /*
406          * Fixme: BSD difference: In BSD all sockets connected to use get
407          *        ECONNRESET and we die on the spot. In Linux we behave
408          *        like files and pipes do and wait for the last
409          *        dereference.
410          *
411          * Can't we simply set sock->err?
412          *
413          *        What the above comment does talk about? --ANK(980817)
414          */
415
416         if (atomic_read(&unix_tot_inflight))
417                 unix_gc();              /* Garbage collect fds */       
418
419         return 0;
420 }
421
422 static int unix_listen(struct socket *sock, int backlog)
423 {
424         int err;
425         struct sock *sk = sock->sk;
426         struct unix_sock *u = unix_sk(sk);
427
428         err = -EOPNOTSUPP;
429         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
430                 goto out;                       /* Only stream/seqpacket sockets accept */
431         err = -EINVAL;
432         if (!u->addr)
433                 goto out;                       /* No listens on an unbound socket */
434         unix_state_wlock(sk);
435         if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN)
436                 goto out_unlock;
437         if (backlog > sk->sk_max_ack_backlog)
438                 wake_up_interruptible_all(&u->peer_wait);
439         sk->sk_max_ack_backlog  = backlog;
440         sk->sk_state            = TCP_LISTEN;
441         /* set credentials so connect can copy them */
442         sk->sk_peercred.pid     = current->tgid;
443         sk->sk_peercred.uid     = current->euid;
444         sk->sk_peercred.gid     = current->egid;
445         err = 0;
446
447 out_unlock:
448         unix_state_wunlock(sk);
449 out:
450         return err;
451 }
452
453 static int unix_release(struct socket *);
454 static int unix_bind(struct socket *, struct sockaddr *, int);
455 static int unix_stream_connect(struct socket *, struct sockaddr *,
456                                int addr_len, int flags);
457 static int unix_socketpair(struct socket *, struct socket *);
458 static int unix_accept(struct socket *, struct socket *, int);
459 static int unix_getname(struct socket *, struct sockaddr *, int *, int);
460 static unsigned int unix_poll(struct file *, struct socket *, poll_table *);
461 static int unix_ioctl(struct socket *, unsigned int, unsigned long);
462 static int unix_shutdown(struct socket *, int);
463 static int unix_stream_sendmsg(struct kiocb *, struct socket *,
464                                struct msghdr *, size_t);
465 static int unix_stream_recvmsg(struct kiocb *, struct socket *,
466                                struct msghdr *, size_t, int);
467 static int unix_dgram_sendmsg(struct kiocb *, struct socket *,
468                               struct msghdr *, size_t);
469 static int unix_dgram_recvmsg(struct kiocb *, struct socket *,
470                               struct msghdr *, size_t, int);
471 static int unix_dgram_connect(struct socket *, struct sockaddr *,
472                               int, int);
473 static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *,
474                                   struct msghdr *, size_t);
475
476 static struct proto_ops unix_stream_ops = {
477         .family =       PF_UNIX,
478         .owner =        THIS_MODULE,
479         .release =      unix_release,
480         .bind =         unix_bind,
481         .connect =      unix_stream_connect,
482         .socketpair =   unix_socketpair,
483         .accept =       unix_accept,
484         .getname =      unix_getname,
485         .poll =         unix_poll,
486         .ioctl =        unix_ioctl,
487         .listen =       unix_listen,
488         .shutdown =     unix_shutdown,
489         .setsockopt =   sock_no_setsockopt,
490         .getsockopt =   sock_no_getsockopt,
491         .sendmsg =      unix_stream_sendmsg,
492         .recvmsg =      unix_stream_recvmsg,
493         .mmap =         sock_no_mmap,
494         .sendpage =     sock_no_sendpage,
495 };
496
497 static struct proto_ops unix_dgram_ops = {
498         .family =       PF_UNIX,
499         .owner =        THIS_MODULE,
500         .release =      unix_release,
501         .bind =         unix_bind,
502         .connect =      unix_dgram_connect,
503         .socketpair =   unix_socketpair,
504         .accept =       sock_no_accept,
505         .getname =      unix_getname,
506         .poll =         datagram_poll,
507         .ioctl =        unix_ioctl,
508         .listen =       sock_no_listen,
509         .shutdown =     unix_shutdown,
510         .setsockopt =   sock_no_setsockopt,
511         .getsockopt =   sock_no_getsockopt,
512         .sendmsg =      unix_dgram_sendmsg,
513         .recvmsg =      unix_dgram_recvmsg,
514         .mmap =         sock_no_mmap,
515         .sendpage =     sock_no_sendpage,
516 };
517
518 static struct proto_ops unix_seqpacket_ops = {
519         .family =       PF_UNIX,
520         .owner =        THIS_MODULE,
521         .release =      unix_release,
522         .bind =         unix_bind,
523         .connect =      unix_stream_connect,
524         .socketpair =   unix_socketpair,
525         .accept =       unix_accept,
526         .getname =      unix_getname,
527         .poll =         datagram_poll,
528         .ioctl =        unix_ioctl,
529         .listen =       unix_listen,
530         .shutdown =     unix_shutdown,
531         .setsockopt =   sock_no_setsockopt,
532         .getsockopt =   sock_no_getsockopt,
533         .sendmsg =      unix_seqpacket_sendmsg,
534         .recvmsg =      unix_dgram_recvmsg,
535         .mmap =         sock_no_mmap,
536         .sendpage =     sock_no_sendpage,
537 };
538
539 static struct proto unix_proto = {
540         .name     = "UNIX",
541         .owner    = THIS_MODULE,
542         .obj_size = sizeof(struct unix_sock),
543 };
544
545 static struct sock * unix_create1(struct socket *sock)
546 {
547         struct sock *sk = NULL;
548         struct unix_sock *u;
549
550         if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files)
551                 goto out;
552
553         sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1);
554         if (!sk)
555                 goto out;
556
557         atomic_inc(&unix_nr_socks);
558
559         sock_init_data(sock,sk);
560
561         sk->sk_write_space      = unix_write_space;
562         sk->sk_max_ack_backlog  = sysctl_unix_max_dgram_qlen;
563         sk->sk_destruct         = unix_sock_destructor;
564         u         = unix_sk(sk);
565         u->dentry = NULL;
566         u->mnt    = NULL;
567         rwlock_init(&u->lock);
568         atomic_set(&u->inflight, sock ? 0 : -1);
569         init_MUTEX(&u->readsem); /* single task reading lock */
570         init_waitqueue_head(&u->peer_wait);
571         unix_insert_socket(unix_sockets_unbound, sk);
572 out:
573         return sk;
574 }
575
576 static int unix_create(struct socket *sock, int protocol)
577 {
578         if (protocol && protocol != PF_UNIX)
579                 return -EPROTONOSUPPORT;
580
581         sock->state = SS_UNCONNECTED;
582
583         switch (sock->type) {
584         case SOCK_STREAM:
585                 sock->ops = &unix_stream_ops;
586                 break;
587                 /*
588                  *      Believe it or not BSD has AF_UNIX, SOCK_RAW though
589                  *      nothing uses it.
590                  */
591         case SOCK_RAW:
592                 sock->type=SOCK_DGRAM;
593         case SOCK_DGRAM:
594                 sock->ops = &unix_dgram_ops;
595                 break;
596         case SOCK_SEQPACKET:
597                 sock->ops = &unix_seqpacket_ops;
598                 break;
599         default:
600                 return -ESOCKTNOSUPPORT;
601         }
602
603         return unix_create1(sock) ? 0 : -ENOMEM;
604 }
605
606 static int unix_release(struct socket *sock)
607 {
608         struct sock *sk = sock->sk;
609
610         if (!sk)
611                 return 0;
612
613         sock->sk = NULL;
614
615         return unix_release_sock (sk, 0);
616 }
617
618 static int unix_autobind(struct socket *sock)
619 {
620         struct sock *sk = sock->sk;
621         struct unix_sock *u = unix_sk(sk);
622         static u32 ordernum = 1;
623         struct unix_address * addr;
624         int err;
625
626         down(&u->readsem);
627
628         err = 0;
629         if (u->addr)
630                 goto out;
631
632         err = -ENOMEM;
633         addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL);
634         if (!addr)
635                 goto out;
636
637         memset(addr, 0, sizeof(*addr) + sizeof(short) + 16);
638         addr->name->sun_family = AF_UNIX;
639         atomic_set(&addr->refcnt, 1);
640
641 retry:
642         addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short);
643         addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0));
644
645         write_lock(&unix_table_lock);
646         ordernum = (ordernum+1)&0xFFFFF;
647
648         if (__unix_find_socket_byname(addr->name, addr->len, sock->type,
649                                       addr->hash)) {
650                 write_unlock(&unix_table_lock);
651                 /* Sanity yield. It is unusual case, but yet... */
652                 if (!(ordernum&0xFF))
653                         yield();
654                 goto retry;
655         }
656         addr->hash ^= sk->sk_type;
657
658         __unix_remove_socket(sk);
659         u->addr = addr;
660         __unix_insert_socket(&unix_socket_table[addr->hash], sk);
661         write_unlock(&unix_table_lock);
662         err = 0;
663
664 out:    up(&u->readsem);
665         return err;
666 }
667
668 static struct sock *unix_find_other(struct sockaddr_un *sunname, int len,
669                                     int type, unsigned hash, int *error)
670 {
671         struct sock *u;
672         struct nameidata nd;
673         int err = 0;
674         
675         if (sunname->sun_path[0]) {
676                 err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd);
677                 if (err)
678                         goto fail;
679                 err = permission(nd.dentry->d_inode,MAY_WRITE, &nd);
680                 if (err)
681                         goto put_fail;
682
683                 err = -ECONNREFUSED;
684                 if (!S_ISSOCK(nd.dentry->d_inode->i_mode))
685                         goto put_fail;
686                 u=unix_find_socket_byinode(nd.dentry->d_inode);
687                 if (!u)
688                         goto put_fail;
689
690                 if (u->sk_type == type)
691                         touch_atime(nd.mnt, nd.dentry);
692
693                 path_release(&nd);
694
695                 err=-EPROTOTYPE;
696                 if (u->sk_type != type) {
697                         sock_put(u);
698                         goto fail;
699                 }
700         } else {
701                 err = -ECONNREFUSED;
702                 u=unix_find_socket_byname(sunname, len, type, hash);
703                 if (u) {
704                         struct dentry *dentry;
705                         dentry = unix_sk(u)->dentry;
706                         if (dentry)
707                                 touch_atime(unix_sk(u)->mnt, dentry);
708                 } else
709                         goto fail;
710         }
711         return u;
712
713 put_fail:
714         path_release(&nd);
715 fail:
716         *error=err;
717         return NULL;
718 }
719
720
721 static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
722 {
723         struct sock *sk = sock->sk;
724         struct unix_sock *u = unix_sk(sk);
725         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
726         struct dentry * dentry = NULL;
727         struct nameidata nd;
728         int err;
729         unsigned hash;
730         struct unix_address *addr;
731         struct hlist_head *list;
732
733         err = -EINVAL;
734         if (sunaddr->sun_family != AF_UNIX)
735                 goto out;
736
737         if (addr_len==sizeof(short)) {
738                 err = unix_autobind(sock);
739                 goto out;
740         }
741
742         err = unix_mkname(sunaddr, addr_len, &hash);
743         if (err < 0)
744                 goto out;
745         addr_len = err;
746
747         down(&u->readsem);
748
749         err = -EINVAL;
750         if (u->addr)
751                 goto out_up;
752
753         err = -ENOMEM;
754         addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
755         if (!addr)
756                 goto out_up;
757
758         memcpy(addr->name, sunaddr, addr_len);
759         addr->len = addr_len;
760         addr->hash = hash ^ sk->sk_type;
761         atomic_set(&addr->refcnt, 1);
762
763         if (sunaddr->sun_path[0]) {
764                 unsigned int mode;
765                 err = 0;
766                 /*
767                  * Get the parent directory, calculate the hash for last
768                  * component.
769                  */
770                 err = path_lookup(sunaddr->sun_path, LOOKUP_PARENT, &nd);
771                 if (err)
772                         goto out_mknod_parent;
773                 /*
774                  * Yucky last component or no last component at all?
775                  * (foo/., foo/.., /////)
776                  */
777                 err = -EEXIST;
778                 if (nd.last_type != LAST_NORM)
779                         goto out_mknod;
780                 /*
781                  * Lock the directory.
782                  */
783                 down(&nd.dentry->d_inode->i_sem);
784                 /*
785                  * Do the final lookup.
786                  */
787                 dentry = lookup_hash(&nd.last, nd.dentry);
788                 err = PTR_ERR(dentry);
789                 if (IS_ERR(dentry))
790                         goto out_mknod_unlock;
791                 err = -ENOENT;
792                 /*
793                  * Special case - lookup gave negative, but... we had foo/bar/
794                  * From the vfs_mknod() POV we just have a negative dentry -
795                  * all is fine. Let's be bastards - you had / on the end, you've
796                  * been asking for (non-existent) directory. -ENOENT for you.
797                  */
798                 if (nd.last.name[nd.last.len] && !dentry->d_inode)
799                         goto out_mknod_dput;
800                 /*
801                  * All right, let's create it.
802                  */
803                 mode = S_IFSOCK |
804                        (SOCK_INODE(sock)->i_mode & ~current->fs->umask);
805                 err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0);
806                 if (err)
807                         goto out_mknod_dput;
808                 up(&nd.dentry->d_inode->i_sem);
809                 dput(nd.dentry);
810                 nd.dentry = dentry;
811
812                 addr->hash = UNIX_HASH_SIZE;
813         }
814
815         write_lock(&unix_table_lock);
816
817         if (!sunaddr->sun_path[0]) {
818                 err = -EADDRINUSE;
819                 if (__unix_find_socket_byname(sunaddr, addr_len,
820                                               sk->sk_type, hash)) {
821                         unix_release_addr(addr);
822                         goto out_unlock;
823                 }
824
825                 list = &unix_socket_table[addr->hash];
826         } else {
827                 list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)];
828                 u->dentry = nd.dentry;
829                 u->mnt    = nd.mnt;
830         }
831
832         err = 0;
833         __unix_remove_socket(sk);
834         u->addr = addr;
835         __unix_insert_socket(list, sk);
836
837 out_unlock:
838         write_unlock(&unix_table_lock);
839 out_up:
840         up(&u->readsem);
841 out:
842         return err;
843
844 out_mknod_dput:
845         dput(dentry);
846 out_mknod_unlock:
847         up(&nd.dentry->d_inode->i_sem);
848 out_mknod:
849         path_release(&nd);
850 out_mknod_parent:
851         if (err==-EEXIST)
852                 err=-EADDRINUSE;
853         unix_release_addr(addr);
854         goto out_up;
855 }
856
857 static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr,
858                               int alen, int flags)
859 {
860         struct sock *sk = sock->sk;
861         struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr;
862         struct sock *other;
863         unsigned hash;
864         int err;
865
866         if (addr->sa_family != AF_UNSPEC) {
867                 err = unix_mkname(sunaddr, alen, &hash);
868                 if (err < 0)
869                         goto out;
870                 alen = err;
871
872                 if (test_bit(SOCK_PASSCRED, &sock->flags) &&
873                     !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0)
874                         goto out;
875
876                 other=unix_find_other(sunaddr, alen, sock->type, hash, &err);
877                 if (!other)
878                         goto out;
879
880                 unix_state_wlock(sk);
881
882                 err = -EPERM;
883                 if (!unix_may_send(sk, other))
884                         goto out_unlock;
885
886                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
887                 if (err)
888                         goto out_unlock;
889
890         } else {
891                 /*
892                  *      1003.1g breaking connected state with AF_UNSPEC
893                  */
894                 other = NULL;
895                 unix_state_wlock(sk);
896         }
897
898         /*
899          * If it was connected, reconnect.
900          */
901         if (unix_peer(sk)) {
902                 struct sock *old_peer = unix_peer(sk);
903                 unix_peer(sk)=other;
904                 unix_state_wunlock(sk);
905
906                 if (other != old_peer)
907                         unix_dgram_disconnected(sk, old_peer);
908                 sock_put(old_peer);
909         } else {
910                 unix_peer(sk)=other;
911                 unix_state_wunlock(sk);
912         }
913         return 0;
914
915 out_unlock:
916         unix_state_wunlock(sk);
917         sock_put(other);
918 out:
919         return err;
920 }
921
922 static long unix_wait_for_peer(struct sock *other, long timeo)
923 {
924         struct unix_sock *u = unix_sk(other);
925         int sched;
926         DEFINE_WAIT(wait);
927
928         prepare_to_wait_exclusive(&u->peer_wait, &wait, TASK_INTERRUPTIBLE);
929
930         sched = !sock_flag(other, SOCK_DEAD) &&
931                 !(other->sk_shutdown & RCV_SHUTDOWN) &&
932                 (skb_queue_len(&other->sk_receive_queue) >
933                  other->sk_max_ack_backlog);
934
935         unix_state_runlock(other);
936
937         if (sched)
938                 timeo = schedule_timeout(timeo);
939
940         finish_wait(&u->peer_wait, &wait);
941         return timeo;
942 }
943
/*
 * Connect a connection-oriented socket to a listening peer.
 *
 * Outline:
 *  - normalise the address (unix_mkname) and autobind if SOCK_PASSCRED is
 *    set and we have no address yet;
 *  - pre-allocate the sock that will become the accepted end (newsk) and a
 *    one-byte skb charged to it;
 *  - look up the listener, read-lock it, and retry if it died meanwhile;
 *  - fail with -ECONNREFUSED unless the listener is in TCP_LISTEN;
 *  - if its receive queue is longer than sk_max_ack_backlog, either fail
 *    with -EAGAIN (zero timeout) or wait and start over;
 *  - check and write-lock our own state, run the security hook, then wire
 *    sk <-> newsk together and queue the skb on the listener.
 *
 * unix_accept() later recovers newsk from skb->sk, so sock_wmalloc() is
 * presumably what ties the skb to newsk - confirm against sock_wmalloc().
 *
 * Returns 0 on success or a negative errno; on failure the skb, newsk and
 * the reference on the listener are released.
 */
944 static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
945                                int addr_len, int flags)
946 {
947         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
948         struct sock *sk = sock->sk;
949         struct unix_sock *u = unix_sk(sk), *newu, *otheru;
950         struct sock *newsk = NULL;
951         struct sock *other = NULL;
952         struct sk_buff *skb = NULL;
953         unsigned hash;
954         int st;
955         int err;
956         long timeo;
957
958         err = unix_mkname(sunaddr, addr_len, &hash);
959         if (err < 0)
960                 goto out;
961         addr_len = err;
962
963         if (test_bit(SOCK_PASSCRED, &sock->flags)
964                 && !u->addr && (err = unix_autobind(sock)) != 0)
965                 goto out;
966
967         timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);
968
969         /* First of all allocate resources.
970            If we will make it after state is locked,
971            we will have to recheck all again in any case.
972          */
973
974         err = -ENOMEM;
975
976         /* create new sock for complete connection */
977         newsk = unix_create1(NULL);
978         if (newsk == NULL)
979                 goto out;
980
981         /* Allocate skb for sending to listening sock */
982         skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL);
983         if (skb == NULL)
984                 goto out;
985
986 restart:
987         /*  Find listening sock. */
988         other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err);
989         if (!other)
990                 goto out;
991
992         /* Latch state of peer */
993         unix_state_rlock(other);
994
995         /* Apparently VFS overslept socket death. Retry. */
996         if (sock_flag(other, SOCK_DEAD)) {
997                 unix_state_runlock(other);
998                 sock_put(other);
999                 goto restart;
1000         }
1001
1002         err = -ECONNREFUSED;
1003         if (other->sk_state != TCP_LISTEN)
1004                 goto out_unlock;
1005
1006         if (skb_queue_len(&other->sk_receive_queue) >
1007             other->sk_max_ack_backlog) {
1008                 err = -EAGAIN;
1009                 if (!timeo)
1010                         goto out_unlock;
1011
                     /* unix_wait_for_peer() releases the read lock on other. */
1012                 timeo = unix_wait_for_peer(other, timeo);
1013
1014                 err = sock_intr_errno(timeo);
                     /* Lock already dropped: leave via out, not out_unlock. */
1015                 if (signal_pending(current))
1016                         goto out;
1017                 sock_put(other);
1018                 goto restart;
1019         }
1020
1021         /* Latch our state.
1022
1023            It is tricky place. We need to grab write lock and cannot
1024            drop lock on peer. It is dangerous because deadlock is
1025            possible. Connect to self case and simultaneous
1026            attempt to connect are eliminated by checking socket
1027            state. other is TCP_LISTEN, if sk is TCP_LISTEN we
1028            check this before attempt to grab lock.
1029
1030            Well, and we have to recheck the state after socket locked.
1031          */
1032         st = sk->sk_state;
1033
1034         switch (st) {
1035         case TCP_CLOSE:
1036                 /* This is ok... continue with connect */
1037                 break;
1038         case TCP_ESTABLISHED:
1039                 /* Socket is already connected */
1040                 err = -EISCONN;
1041                 goto out_unlock;
1042         default:
1043                 err = -EINVAL;
1044                 goto out_unlock;
1045         }
1046
1047         unix_state_wlock(sk);
1048
             /* Our state changed while we were unlocked: drop all and retry. */
1049         if (sk->sk_state != st) {
1050                 unix_state_wunlock(sk);
1051                 unix_state_runlock(other);
1052                 sock_put(other);
1053                 goto restart;
1054         }
1055
1056         err = security_unix_stream_connect(sock, other->sk_socket, newsk);
1057         if (err) {
1058                 unix_state_wunlock(sk);
1059                 goto out_unlock;
1060         }
1061
1062         /* The way is open! Fastly set all the necessary fields... */
1063
             /* newsk's peer pointer holds a reference on sk. */
1064         sock_hold(sk);
1065         unix_peer(newsk)        = sk;
1066         newsk->sk_state         = TCP_ESTABLISHED;
1067         newsk->sk_type          = sk->sk_type;
1068         newsk->sk_peercred.pid  = current->tgid;
1069         newsk->sk_peercred.uid  = current->euid;
1070         newsk->sk_peercred.gid  = current->egid;
1071         newu = unix_sk(newsk);
1072         newsk->sk_sleep         = &newu->peer_wait;
1073         otheru = unix_sk(other);
1074
1075         /* copy address information from listening to new sock*/
1076         if (otheru->addr) {
1077                 atomic_inc(&otheru->addr->refcnt);
1078                 newu->addr = otheru->addr;
1079         }
1080         if (otheru->dentry) {
1081                 newu->dentry    = dget(otheru->dentry);
1082                 newu->mnt       = mntget(otheru->mnt);
1083         }
1084
1085         /* Set credentials */
1086         sk->sk_peercred = other->sk_peercred;
1087
             /* ...and sk's peer pointer holds a reference on newsk. */
1088         sock_hold(newsk);
1089         unix_peer(sk)   = newsk;
1090         sock->state     = SS_CONNECTED;
1091         sk->sk_state    = TCP_ESTABLISHED;
1092
1093         unix_state_wunlock(sk);
1094
1095         /* take ten and send info to listening sock */
1096         spin_lock(&other->sk_receive_queue.lock);
1097         __skb_queue_tail(&other->sk_receive_queue, skb);
1098         /* Undo artificially decreased inflight after embryo
1099          * is installed to listening socket. */
1100         atomic_inc(&newu->inflight);
1101         spin_unlock(&other->sk_receive_queue.lock);
1102         unix_state_runlock(other);
1103         other->sk_data_ready(other, 0);
1104         sock_put(other);
1105         return 0;
1106
1107 out_unlock:
1108         if (other)
1109                 unix_state_runlock(other);
1110
1111 out:
1112         if (skb)
1113                 kfree_skb(skb);
1114         if (newsk)
1115                 unix_release_sock(newsk, 0);
1116         if (other)
1117                 sock_put(other);
1118         return err;
1119 }
1120
1121 static int unix_socketpair(struct socket *socka, struct socket *sockb)
1122 {
1123         struct sock *ska=socka->sk, *skb = sockb->sk;
1124
1125         /* Join our sockets back to back */
1126         sock_hold(ska);
1127         sock_hold(skb);
1128         unix_peer(ska)=skb;
1129         unix_peer(skb)=ska;
1130         ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid;
1131         ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid;
1132         ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid;
1133
1134         if (ska->sk_type != SOCK_DGRAM) {
1135                 ska->sk_state = TCP_ESTABLISHED;
1136                 skb->sk_state = TCP_ESTABLISHED;
1137                 socka->state  = SS_CONNECTED;
1138                 sockb->state  = SS_CONNECTED;
1139         }
1140         return 0;
1141 }
1142
1143 static int unix_accept(struct socket *sock, struct socket *newsock, int flags)
1144 {
1145         struct sock *sk = sock->sk;
1146         struct sock *tsk;
1147         struct sk_buff *skb;
1148         int err;
1149
1150         err = -EOPNOTSUPP;
1151         if (sock->type!=SOCK_STREAM && sock->type!=SOCK_SEQPACKET)
1152                 goto out;
1153
1154         err = -EINVAL;
1155         if (sk->sk_state != TCP_LISTEN)
1156                 goto out;
1157
1158         /* If socket state is TCP_LISTEN it cannot change (for now...),
1159          * so that no locks are necessary.
1160          */
1161
1162         skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err);
1163         if (!skb) {
1164                 /* This means receive shutdown. */
1165                 if (err == 0)
1166                         err = -EINVAL;
1167                 goto out;
1168         }
1169
1170         tsk = skb->sk;
1171         skb_free_datagram(sk, skb);
1172         wake_up_interruptible(&unix_sk(sk)->peer_wait);
1173
1174         /* attach accepted sock to socket */
1175         unix_state_wlock(tsk);
1176         newsock->state = SS_CONNECTED;
1177         sock_graft(tsk, newsock);
1178         unix_state_wunlock(tsk);
1179         return 0;
1180
1181 out:
1182         return err;
1183 }
1184
1185
1186 static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_len, int peer)
1187 {
1188         struct sock *sk = sock->sk;
1189         struct unix_sock *u;
1190         struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr;
1191         int err = 0;
1192
1193         if (peer) {
1194                 sk = unix_peer_get(sk);
1195
1196                 err = -ENOTCONN;
1197                 if (!sk)
1198                         goto out;
1199                 err = 0;
1200         } else {
1201                 sock_hold(sk);
1202         }
1203
1204         u = unix_sk(sk);
1205         unix_state_rlock(sk);
1206         if (!u->addr) {
1207                 sunaddr->sun_family = AF_UNIX;
1208                 sunaddr->sun_path[0] = 0;
1209                 *uaddr_len = sizeof(short);
1210         } else {
1211                 struct unix_address *addr = u->addr;
1212
1213                 *uaddr_len = addr->len;
1214                 memcpy(sunaddr, addr->name, *uaddr_len);
1215         }
1216         unix_state_runlock(sk);
1217         sock_put(sk);
1218 out:
1219         return err;
1220 }
1221
1222 static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1223 {
1224         int i;
1225
1226         scm->fp = UNIXCB(skb).fp;
1227         skb->destructor = sock_wfree;
1228         UNIXCB(skb).fp = NULL;
1229
1230         for (i=scm->fp->count-1; i>=0; i--)
1231                 unix_notinflight(scm->fp->fp[i]);
1232 }
1233
1234 static void unix_destruct_fds(struct sk_buff *skb)
1235 {
1236         struct scm_cookie scm;
1237         memset(&scm, 0, sizeof(scm));
1238         unix_detach_fds(&scm, skb);
1239
1240         /* Alas, it calls VFS */
1241         /* So fscking what? fput() had been SMP-safe since the last Summer */
1242         scm_destroy(&scm);
1243         sock_wfree(skb);
1244 }
1245
1246 static void unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
1247 {
1248         int i;
1249         for (i=scm->fp->count-1; i>=0; i--)
1250                 unix_inflight(scm->fp->fp[i]);
1251         UNIXCB(skb).fp = scm->fp;
1252         skb->destructor = unix_destruct_fds;
1253         scm->fp = NULL;
1254 }
1255
1256 /*
1257  *      Send AF_UNIX data.
1258  */
1259
/*
 * Send one datagram (also used for SOCK_SEQPACKET via
 * unix_seqpacket_sendmsg()).
 *
 * The destination is msg->msg_name when msg_namelen is non-zero,
 * otherwise the connected peer (-ENOTCONN if there is none).  The whole
 * payload goes into a single skb together with the sender's credentials
 * and any passed file list; messages longer than sk_sndbuf - 32 are
 * refused with -EMSGSIZE, and MSG_OOB with -EOPNOTSUPP.
 *
 * Delivery may loop through "restart": after waiting for room in the
 * receiver's queue, or after finding the looked-up receiver dead.
 *
 * Returns len on success or a negative errno.  The scm cookie is
 * destroyed on every path past scm_send().
 */
1260 static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock,
1261                               struct msghdr *msg, size_t len)
1262 {
1263         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1264         struct sock *sk = sock->sk;
1265         struct unix_sock *u = unix_sk(sk);
1266         struct sockaddr_un *sunaddr=msg->msg_name;
1267         struct sock *other = NULL;
1268         int namelen = 0; /* fake GCC */
1269         int err;
1270         unsigned hash;
1271         struct sk_buff *skb;
1272         long timeo;
1273         struct scm_cookie tmp_scm;
1274
             /* Use a stack cookie when the caller did not supply one. */
1275         if (NULL == siocb->scm)
1276                 siocb->scm = &tmp_scm;
1277         err = scm_send(sock, msg, siocb->scm);
1278         if (err < 0)
1279                 return err;
1280
1281         err = -EOPNOTSUPP;
1282         if (msg->msg_flags&MSG_OOB)
1283                 goto out;
1284
1285         if (msg->msg_namelen) {
1286                 err = unix_mkname(sunaddr, msg->msg_namelen, &hash);
1287                 if (err < 0)
1288                         goto out;
1289                 namelen = err;
1290         } else {
                     /* No address given: fall back to the connected peer. */
1291                 sunaddr = NULL;
1292                 err = -ENOTCONN;
1293                 other = unix_peer_get(sk);
1294                 if (!other)
1295                         goto out;
1296         }
1297
1298         if (test_bit(SOCK_PASSCRED, &sock->flags)
1299                 && !u->addr && (err = unix_autobind(sock)) != 0)
1300                 goto out;
1301
1302         err = -EMSGSIZE;
1303         if (len > sk->sk_sndbuf - 32)
1304                 goto out;
1305
1306         skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
1307         if (skb==NULL)
1308                 goto out;
1309
1310         memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
             /* From here on the skb owns the file list, not the cookie. */
1311         if (siocb->scm->fp)
1312                 unix_attach_fds(siocb->scm, skb);
1313
1314         skb->h.raw = skb->data;
1315         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
1316         if (err)
1317                 goto out_free;
1318
1319         timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
1320
1321 restart:
1322         if (!other) {
                     /* No receiver and no address to look one up with. */
1323                 err = -ECONNRESET;
1324                 if (sunaddr == NULL)
1325                         goto out_free;
1326
1327                 other = unix_find_other(sunaddr, namelen, sk->sk_type,
1328                                         hash, &err);
1329                 if (other==NULL)
1330                         goto out_free;
1331         }
1332
1333         unix_state_rlock(other);
1334         err = -EPERM;
1335         if (!unix_may_send(sk, other))
1336                 goto out_unlock;
1337
1338         if (sock_flag(other, SOCK_DEAD)) {
1339                 /*
1340                  *      Check with 1003.1g - what should
1341                  *      datagram error
1342                  */
1343                 unix_state_runlock(other);
1344                 sock_put(other);
1345
                     /*
                      * If the dead sock is still our connected peer, break
                      * the link (dropping the reference the link held) and
                      * fail with -ECONNREFUSED; otherwise look it up again.
                      */
1346                 err = 0;
1347                 unix_state_wlock(sk);
1348                 if (unix_peer(sk) == other) {
1349                         unix_peer(sk)=NULL;
1350                         unix_state_wunlock(sk);
1351
1352                         unix_dgram_disconnected(sk, other);
1353                         sock_put(other);
1354                         err = -ECONNREFUSED;
1355                 } else {
1356                         unix_state_wunlock(sk);
1357                 }
1358
1359                 other = NULL;
1360                 if (err)
1361                         goto out_free;
1362                 goto restart;
1363         }
1364
1365         err = -EPIPE;
1366         if (other->sk_shutdown & RCV_SHUTDOWN)
1367                 goto out_unlock;
1368
1369         if (sk->sk_type != SOCK_SEQPACKET) {
1370                 err = security_unix_may_send(sk->sk_socket, other->sk_socket);
1371                 if (err)
1372                         goto out_unlock;
1373         }
1374
             /* The backlog limit is not applied when the receiver's peer is us. */
1375         if (unix_peer(other) != sk &&
1376             (skb_queue_len(&other->sk_receive_queue) >
1377              other->sk_max_ack_backlog)) {
1378                 if (!timeo) {
1379                         err = -EAGAIN;
1380                         goto out_unlock;
1381                 }
1382
                     /* unix_wait_for_peer() releases the read lock on other. */
1383                 timeo = unix_wait_for_peer(other, timeo);
1384
1385                 err = sock_intr_errno(timeo);
1386                 if (signal_pending(current))
1387                         goto out_free;
1388
1389                 goto restart;
1390         }
1391
1392         skb_queue_tail(&other->sk_receive_queue, skb);
1393         unix_state_runlock(other);
1394         other->sk_data_ready(other, len);
1395         sock_put(other);
1396         scm_destroy(siocb->scm);
1397         return len;
1398
1399 out_unlock:
1400         unix_state_runlock(other);
1401 out_free:
1402         kfree_skb(skb);
1403 out:
1404         if (other)
1405                 sock_put(other);
1406         scm_destroy(siocb->scm);
1407         return err;
1408 }
1409
1410                 
/*
 * Send data on a connected stream socket.
 *
 * An explicit destination address is refused (-EISCONN when established,
 * -EOPNOTSUPP otherwise), as is MSG_OOB (-EOPNOTSUPP).  The payload is
 * cut into skbs of at most sk_sndbuf / 2 - 64 bytes (and at most
 * SKB_MAX_ALLOC), each queued directly on the peer's receive queue.
 *
 * If we have shut down sending, or the peer is dead or has shut down
 * receiving, the result is -EPIPE, with SIGPIPE raised when nothing was
 * sent yet and MSG_NOSIGNAL is not set.
 *
 * Returns the number of bytes queued if that is non-zero, otherwise the
 * negative errno.
 */
1411 static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock,
1412                                struct msghdr *msg, size_t len)
1413 {
1414         struct sock_iocb *siocb = kiocb_to_siocb(kiocb);
1415         struct sock *sk = sock->sk;
1416         struct sock *other = NULL;
1417         struct sockaddr_un *sunaddr=msg->msg_name;
1418         int err,size;
1419         struct sk_buff *skb;
1420         int sent=0;
1421         struct scm_cookie tmp_scm;
1422
              /* Use a stack cookie when the caller did not supply one. */
1423         if (NULL == siocb->scm)
1424                 siocb->scm = &tmp_scm;
1425         err = scm_send(sock, msg, siocb->scm);
1426         if (err < 0)
1427                 return err;
1428
1429         err = -EOPNOTSUPP;
1430         if (msg->msg_flags&MSG_OOB)
1431                 goto out_err;
1432
1433         if (msg->msg_namelen) {
1434                 err = sk->sk_state == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP;
1435                 goto out_err;
1436         } else {
1437                 sunaddr = NULL;
1438                 err = -ENOTCONN;
1439                 other = unix_peer_get(sk);
1440                 if (!other)
1441                         goto out_err;
1442         }
1443
1444         if (sk->sk_shutdown & SEND_SHUTDOWN)
1445                 goto pipe_err;
1446
1447         while(sent < len)
1448         {
1449                 /*
1450                  *      Optimisation for the fact that under 0.01% of X messages typically
1451                  *      need breaking up.
1452                  */
1453
1454                 size=len-sent;
1455
1456                 /* Keep two messages in the pipe so it schedules better */
1457                 if (size > sk->sk_sndbuf / 2 - 64)
1458                         size = sk->sk_sndbuf / 2 - 64;
1459
1460                 if (size > SKB_MAX_ALLOC)
1461                         size = SKB_MAX_ALLOC;
1462                         
1463                 /*
1464                  *      Grab a buffer
1465                  */
1466                  
1467                 skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
1468
1469                 if (skb==NULL)
1470                         goto out_err;
1471
1472                 /*
1473                  *      If you pass two values to the sock_alloc_send_skb
1474                  *      it tries to grab the large buffer with GFP_NOFS
1475                  *      (which can fail easily), and if it fails grab the
1476                  *      fallback size buffer which is under a page and will
1477                  *      succeed. [Alan]
1478                  */
1479                 size = min_t(int, size, skb_tailroom(skb));
1480
1481                 memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
                      /*
                       * unix_attach_fds() clears scm->fp, so only the first
                       * skb of the message carries the file list.
                       */
1482                 if (siocb->scm->fp)
1483                         unix_attach_fds(siocb->scm, skb);
1484
1485                 if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
1486                         kfree_skb(skb);
1487                         goto out_err;
1488                 }
1489
1490                 unix_state_rlock(other);
1491
1492                 if (sock_flag(other, SOCK_DEAD) ||
1493                     (other->sk_shutdown & RCV_SHUTDOWN))
1494                         goto pipe_err_free;
1495
1496                 skb_queue_tail(&other->sk_receive_queue, skb);
1497                 unix_state_runlock(other);
1498                 other->sk_data_ready(other, size);
1499                 sent+=size;
1500         }
1501         sock_put(other);
1502
1503         scm_destroy(siocb->scm);
1504         siocb->scm = NULL;
1505
1506         return sent;
1507
1508 pipe_err_free:
1509         unix_state_runlock(other);
1510         kfree_skb(skb);
1511 pipe_err:
              /* Signal only if nothing at all was delivered. */
1512         if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL))
1513                 send_sig(SIGPIPE,current,0);
1514         err = -EPIPE;
1515 out_err:
1516         if (other)
1517                 sock_put(other);
1518         scm_destroy(siocb->scm);
1519         siocb->scm = NULL;
              /* A partial send reports the byte count rather than the error. */
1520         return sent ? : err;
1521 }
1522
1523 static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock,
1524                                   struct msghdr *msg, size_t len)
1525 {
1526         int err;
1527         struct sock *sk = sock->sk;
1528         
1529         err = sock_error(sk);
1530         if (err)
1531                 return err;
1532
1533         if (sk->sk_state != TCP_ESTABLISHED)
1534                 return -ENOTCONN;
1535
1536         if (msg->msg_namelen)
1537                 msg->msg_namelen = 0;
1538
1539         return unix_dgram_sendmsg(kiocb, sock, msg, len);
1540 }
1541                                                                                             
1542 static void unix_copy_addr(struct msghdr *msg, struct sock *sk)
1543 {
1544         struct unix_sock *u = unix_sk(sk);
1545
1546         msg->msg_namelen = 0;
1547         if (u->addr) {
1548                 msg->msg_namelen = u->addr->len;
1549                 memcpy(msg->msg_name, u->addr->name, u->addr->len);
1550         }
1551 }
1552
/*
 * Receive one datagram.
 *
 * Under u->readsem: fetch an skb with skb_recv_datagram(), wake anyone
 * sleeping on our peer_wait queue, report the sender's address if the
 * caller asked for one, copy at most @size bytes (setting MSG_TRUNC when
 * the datagram is longer) and pass on the sender's credentials plus any
 * file list.  With MSG_PEEK the file list is duplicated rather than
 * detached from the skb.
 *
 * Returns the number of bytes copied or a negative errno (-EOPNOTSUPP
 * for MSG_OOB).
 */
1553 static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
1554                               struct msghdr *msg, size_t size,
1555                               int flags)
1556 {
1557         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1558         struct scm_cookie tmp_scm;
1559         struct sock *sk = sock->sk;
1560         struct unix_sock *u = unix_sk(sk);
1561         int noblock = flags & MSG_DONTWAIT;
1562         struct sk_buff *skb;
1563         int err;
1564
1565         err = -EOPNOTSUPP;
1566         if (flags&MSG_OOB)
1567                 goto out;
1568
1569         msg->msg_namelen = 0;
1570
1571         down(&u->readsem);
1572
1573         skb = skb_recv_datagram(sk, flags, noblock, &err);
1574         if (!skb)
1575                 goto out_unlock;
1576
              /* Senders may be sleeping in unix_wait_for_peer() on this queue. */
1577         wake_up_interruptible(&u->peer_wait);
1578
1579         if (msg->msg_name)
1580                 unix_copy_addr(msg, skb->sk);
1581
1582         if (size > skb->len)
1583                 size = skb->len;
1584         else if (size < skb->len)
1585                 msg->msg_flags |= MSG_TRUNC;
1586
1587         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, size);
1588         if (err)
1589                 goto out_free;
1590
              /* Use a zeroed stack cookie when the caller did not supply one. */
1591         if (!siocb->scm) {
1592                 siocb->scm = &tmp_scm;
1593                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1594         }
1595         siocb->scm->creds = *UNIXCREDS(skb);
1596
1597         if (!(flags & MSG_PEEK))
1598         {
1599                 if (UNIXCB(skb).fp)
1600                         unix_detach_fds(siocb->scm, skb);
1601         }
1602         else 
1603         {
1604                 /* It is questionable: on PEEK we could:
1605                    - do not return fds - good, but too simple 8)
1606                    - return fds, and do not return them on read (old strategy,
1607                      apparently wrong)
1608                    - clone fds (I chose it for now, it is the most universal
1609                      solution)
1610
1611                    POSIX 1003.1g does not actually define this clearly
1612                    at all. POSIX 1003.1g doesn't define a lot of things
1613                    clearly however!
1614
1615                 */
1616                 if (UNIXCB(skb).fp)
1617                         siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1618         }
1619         err = size;
1620
1621         scm_recv(sock, msg, siocb->scm, flags);
1622
1623 out_free:
1624         skb_free_datagram(sk,skb);
1625 out_unlock:
1626         up(&u->readsem);
1627 out:
1628         return err;
1629 }
1630
1631 /*
1632  *      Sleep until data has arrived. But check for races..
1633  */
1634  
1635 static long unix_stream_data_wait(struct sock * sk, long timeo)
1636 {
1637         DEFINE_WAIT(wait);
1638
1639         unix_state_rlock(sk);
1640
1641         for (;;) {
1642                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1643
1644                 if (skb_queue_len(&sk->sk_receive_queue) ||
1645                     sk->sk_err ||
1646                     (sk->sk_shutdown & RCV_SHUTDOWN) ||
1647                     signal_pending(current) ||
1648                     !timeo)
1649                         break;
1650
1651                 set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1652                 unix_state_runlock(sk);
1653                 timeo = schedule_timeout(timeo);
1654                 unix_state_rlock(sk);
1655                 clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1656         }
1657
1658         finish_wait(sk->sk_sleep, &wait);
1659         unix_state_runlock(sk);
1660         return timeo;
1661 }
1662
1663
1664
/*
 * Receive data from a connected stream socket.
 *
 * Requires TCP_ESTABLISHED (-EINVAL otherwise) and rejects MSG_OOB
 * (-EOPNOTSUPP).  Under u->readsem, skbs are taken off the receive queue
 * and copied out until @size is exhausted or one of these stops the loop:
 *  - the queue is empty and at least "target" bytes (sock_rcvlowat) have
 *    been copied, or an error, receive shutdown or zero timeout applies;
 *  - the next skb carries different credentials than the first one;
 *  - the copy to user space fails;
 *  - an skb was only partly consumed, a file list was picked up, or
 *    MSG_PEEK is set (only one skb is looked at when peeking).
 *
 * The semaphore is released while sleeping for more data; a signal during
 * that sleep leaves through "out", i.e. without calling scm_recv().
 *
 * Returns "copied" if it is non-zero (it is set to -EFAULT when the very
 * first copy fails), otherwise err.
 */
1665 static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
1666                                struct msghdr *msg, size_t size,
1667                                int flags)
1668 {
1669         struct sock_iocb *siocb = kiocb_to_siocb(iocb);
1670         struct scm_cookie tmp_scm;
1671         struct sock *sk = sock->sk;
1672         struct unix_sock *u = unix_sk(sk);
1673         struct sockaddr_un *sunaddr=msg->msg_name;
1674         int copied = 0;
1675         int check_creds = 0;
1676         int target;
1677         int err = 0;
1678         long timeo;
1679
1680         err = -EINVAL;
1681         if (sk->sk_state != TCP_ESTABLISHED)
1682                 goto out;
1683
1684         err = -EOPNOTSUPP;
1685         if (flags&MSG_OOB)
1686                 goto out;
1687
1688         target = sock_rcvlowat(sk, flags&MSG_WAITALL, size);
1689         timeo = sock_rcvtimeo(sk, flags&MSG_DONTWAIT);
1690
1691         msg->msg_namelen = 0;
1692
1693         /* Lock the socket to prevent queue disordering
1694          * while sleeps in memcpy_tomsg
1695          */
1696
1697         if (!siocb->scm) {
1698                 siocb->scm = &tmp_scm;
1699                 memset(&tmp_scm, 0, sizeof(tmp_scm));
1700         }
1701
1702         down(&u->readsem);
1703
1704         do
1705         {
1706                 int chunk;
1707                 struct sk_buff *skb;
1708
1709                 skb = skb_dequeue(&sk->sk_receive_queue);
1710                 if (skb==NULL)
1711                 {
1712                         if (copied >= target)
1713                                 break;
1714
1715                         /*
1716                          *      POSIX 1003.1g mandates this order.
1717                          */
1718                          
1719                         if ((err = sock_error(sk)) != 0)
1720                                 break;
1721                         if (sk->sk_shutdown & RCV_SHUTDOWN)
1722                                 break;
1723                         err = -EAGAIN;
1724                         if (!timeo)
1725                                 break;
                              /* Do not hold the semaphore while sleeping. */
1726                         up(&u->readsem);
1727
1728                         timeo = unix_stream_data_wait(sk, timeo);
1729
                              /* readsem is not held here, hence out and not break. */
1730                         if (signal_pending(current)) {
1731                                 err = sock_intr_errno(timeo);
1732                                 goto out;
1733                         }
1734                         down(&u->readsem);
1735                         continue;
1736                 }
1737
1738                 if (check_creds) {
1739                         /* Never glue messages from different writers */
1740                         if (memcmp(UNIXCREDS(skb), &siocb->scm->creds, sizeof(siocb->scm->creds)) != 0) {
1741                                 skb_queue_head(&sk->sk_receive_queue, skb);
1742                                 break;
1743                         }
1744                 } else {
1745                         /* Copy credentials */
1746                         siocb->scm->creds = *UNIXCREDS(skb);
1747                         check_creds = 1;
1748                 }
1749
1750                 /* Copy address just once */
1751                 if (sunaddr)
1752                 {
1753                         unix_copy_addr(msg, skb->sk);
1754                         sunaddr = NULL;
1755                 }
1756
1757                 chunk = min_t(unsigned int, skb->len, size);
1758                 if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
                              /* Copy failed: requeue the skb untouched. */
1759                         skb_queue_head(&sk->sk_receive_queue, skb);
1760                         if (copied == 0)
1761                                 copied = -EFAULT;
1762                         break;
1763                 }
1764                 copied += chunk;
1765                 size -= chunk;
1766
1767                 /* Mark read part of skb as used */
1768                 if (!(flags & MSG_PEEK))
1769                 {
1770                         skb_pull(skb, chunk);
1771
1772                         if (UNIXCB(skb).fp)
1773                                 unix_detach_fds(siocb->scm, skb);
1774
1775                         /* put the skb back if we didn't use it up.. */
1776                         if (skb->len)
1777                         {
1778                                 skb_queue_head(&sk->sk_receive_queue, skb);
1779                                 break;
1780                         }
1781
1782                         kfree_skb(skb);
1783
                              /* Stop once a file list has been collected. */
1784                         if (siocb->scm->fp)
1785                                 break;
1786                 }
1787                 else
1788                 {
1789                         /* It is questionable, see note in unix_dgram_recvmsg.
1790                          */
1791                         if (UNIXCB(skb).fp)
1792                                 siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp);
1793
1794                         /* put message back and return */
1795                         skb_queue_head(&sk->sk_receive_queue, skb);
1796                         break;
1797                 }
1798         } while (size);
1799
1800         up(&u->readsem);
1801         scm_recv(sock, msg, siocb->scm, flags);
1802 out:
1803         return copied ? : err;
1804 }
1805
1806 static int unix_shutdown(struct socket *sock, int mode)
1807 {
1808         struct sock *sk = sock->sk;
1809         struct sock *other;
1810
1811         mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN);
1812
1813         if (mode) {
1814                 unix_state_wlock(sk);
1815                 sk->sk_shutdown |= mode;
1816                 other=unix_peer(sk);
1817                 if (other)
1818                         sock_hold(other);
1819                 unix_state_wunlock(sk);
1820                 sk->sk_state_change(sk);
1821
1822                 if (other &&
1823                         (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
1824
1825                         int peer_mode = 0;
1826
1827                         if (mode&RCV_SHUTDOWN)
1828                                 peer_mode |= SEND_SHUTDOWN;
1829                         if (mode&SEND_SHUTDOWN)
1830                                 peer_mode |= RCV_SHUTDOWN;
1831                         unix_state_wlock(other);
1832                         other->sk_shutdown |= peer_mode;
1833                         unix_state_wunlock(other);
1834                         other->sk_state_change(other);
1835                         read_lock(&other->sk_callback_lock);
1836                         if (peer_mode == SHUTDOWN_MASK)
1837                                 sk_wake_async(other,1,POLL_HUP);
1838                         else if (peer_mode & RCV_SHUTDOWN)
1839                                 sk_wake_async(other,1,POLL_IN);
1840                         read_unlock(&other->sk_callback_lock);
1841                 }
1842                 if (other)
1843                         sock_put(other);
1844         }
1845         return 0;
1846 }
1847
1848 static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1849 {
1850         struct sock *sk = sock->sk;
1851         long amount=0;
1852         int err;
1853
1854         switch(cmd)
1855         {
1856                 case SIOCOUTQ:
1857                         amount = atomic_read(&sk->sk_wmem_alloc);
1858                         err = put_user(amount, (int __user *)arg);
1859                         break;
1860                 case SIOCINQ:
1861                 {
1862                         struct sk_buff *skb;
1863
1864                         if (sk->sk_state == TCP_LISTEN) {
1865                                 err = -EINVAL;
1866                                 break;
1867                         }
1868
1869                         spin_lock(&sk->sk_receive_queue.lock);
1870                         if (sk->sk_type == SOCK_STREAM ||
1871                             sk->sk_type == SOCK_SEQPACKET) {
1872                                 skb_queue_walk(&sk->sk_receive_queue, skb)
1873                                         amount += skb->len;
1874                         } else {
1875                                 skb = skb_peek(&sk->sk_receive_queue);
1876                                 if (skb)
1877                                         amount=skb->len;
1878                         }
1879                         spin_unlock(&sk->sk_receive_queue.lock);
1880                         err = put_user(amount, (int __user *)arg);
1881                         break;
1882                 }
1883
1884                 default:
1885                         err = dev_ioctl(cmd, (void __user *)arg);
1886                         break;
1887         }
1888         return err;
1889 }
1890
1891 static unsigned int unix_poll(struct file * file, struct socket *sock, poll_table *wait)
1892 {
1893         struct sock *sk = sock->sk;
1894         unsigned int mask;
1895
1896         poll_wait(file, sk->sk_sleep, wait);
1897         mask = 0;
1898
1899         /* exceptional events? */
1900         if (sk->sk_err)
1901                 mask |= POLLERR;
1902         if (sk->sk_shutdown == SHUTDOWN_MASK)
1903                 mask |= POLLHUP;
1904
1905         /* readable? */
1906         if (!skb_queue_empty(&sk->sk_receive_queue) ||
1907             (sk->sk_shutdown & RCV_SHUTDOWN))
1908                 mask |= POLLIN | POLLRDNORM;
1909
1910         /* Connection-based need to check for termination and startup */
1911         if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && sk->sk_state == TCP_CLOSE)
1912                 mask |= POLLHUP;
1913
1914         /*
1915          * we set writable also when the other side has shut down the
1916          * connection. This prevents stuck sockets.
1917          */
1918         if (unix_writable(sk))
1919                 mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
1920
1921         return mask;
1922 }
1923
1924
1925 #ifdef CONFIG_PROC_FS
1926 static struct sock *unix_seq_idx(int *iter, loff_t pos)
1927 {
1928         loff_t off = 0;
1929         struct sock *s;
1930
1931         for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) {
1932                 if (off == pos) 
1933                         return s;
1934                 ++off;
1935         }
1936         return NULL;
1937 }
1938
1939
1940 static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
1941 {
1942         read_lock(&unix_table_lock);
1943         return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1);
1944 }
1945
1946 static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1947 {
1948         ++*pos;
1949
1950         if (v == (void *)1) 
1951                 return first_unix_socket(seq->private);
1952         return next_unix_socket(seq->private, v);
1953 }
1954
1955 static void unix_seq_stop(struct seq_file *seq, void *v)
1956 {
1957         read_unlock(&unix_table_lock);
1958 }
1959
1960 static int unix_seq_show(struct seq_file *seq, void *v)
1961 {
1962         
1963         if (v == (void *)1)
1964                 seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
1965                          "Inode Path\n");
1966         else {
1967                 struct sock *s = v;
1968                 struct unix_sock *u = unix_sk(s);
1969                 unix_state_rlock(s);
1970
1971                 seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu",
1972                         s,
1973                         atomic_read(&s->sk_refcnt),
1974                         0,
1975                         s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
1976                         s->sk_type,
1977                         s->sk_socket ?
1978                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
1979                         (s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
1980                         sock_i_ino(s));
1981
1982                 if (u->addr) {
1983                         int i, len;
1984                         seq_putc(seq, ' ');
1985
1986                         i = 0;
1987                         len = u->addr->len - sizeof(short);
1988                         if (!UNIX_ABSTRACT(s))
1989                                 len--;
1990                         else {
1991                                 seq_putc(seq, '@');
1992                                 i++;
1993                         }
1994                         for ( ; i < len; i++)
1995                                 seq_putc(seq, u->addr->name->sun_path[i]);
1996                 }
1997                 unix_state_runlock(s);
1998                 seq_putc(seq, '\n');
1999         }
2000
2001         return 0;
2002 }
2003
2004 static struct seq_operations unix_seq_ops = {
2005         .start  = unix_seq_start,
2006         .next   = unix_seq_next,
2007         .stop   = unix_seq_stop,
2008         .show   = unix_seq_show,
2009 };
2010
2011
2012 static int unix_seq_open(struct inode *inode, struct file *file)
2013 {
2014         struct seq_file *seq;
2015         int rc = -ENOMEM;
2016         int *iter = kmalloc(sizeof(int), GFP_KERNEL);
2017
2018         if (!iter)
2019                 goto out;
2020
2021         rc = seq_open(file, &unix_seq_ops);
2022         if (rc)
2023                 goto out_kfree;
2024
2025         seq          = file->private_data;
2026         seq->private = iter;
2027         *iter = 0;
2028 out:
2029         return rc;
2030 out_kfree:
2031         kfree(iter);
2032         goto out;
2033 }
2034
2035 static struct file_operations unix_seq_fops = {
2036         .owner          = THIS_MODULE,
2037         .open           = unix_seq_open,
2038         .read           = seq_read,
2039         .llseek         = seq_lseek,
2040         .release        = seq_release_private,
2041 };
2042
2043 #endif
2044
2045 static struct net_proto_family unix_family_ops = {
2046         .family = PF_UNIX,
2047         .create = unix_create,
2048         .owner  = THIS_MODULE,
2049 };
2050
/*
 * sysctl hooks: empty inline stubs when CONFIG_SYSCTL is not set,
 * otherwise declarations of functions defined elsewhere.
 */
#ifndef CONFIG_SYSCTL
static inline void unix_sysctl_register(void) {}
static inline void unix_sysctl_unregister(void) {}
#else
extern void unix_sysctl_register(void);
extern void unix_sysctl_unregister(void);
#endif
2058
2059 static int __init af_unix_init(void)
2060 {
2061         int rc = -1;
2062         struct sk_buff *dummy_skb;
2063
2064         if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) {
2065                 printk(KERN_CRIT "%s: panic\n", __FUNCTION__);
2066                 goto out;
2067         }
2068
2069         rc = proto_register(&unix_proto, 1);
2070         if (rc != 0) {
2071                 printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n",
2072                        __FUNCTION__);
2073                 goto out;
2074         }
2075
2076         sock_register(&unix_family_ops);
2077 #ifdef CONFIG_PROC_FS
2078         proc_net_fops_create("unix", 0, &unix_seq_fops);
2079 #endif
2080         unix_sysctl_register();
2081 out:
2082         return rc;
2083 }
2084
2085 static void __exit af_unix_exit(void)
2086 {
2087         sock_unregister(PF_UNIX);
2088         unix_sysctl_unregister();
2089         proc_net_remove("unix");
2090         proto_unregister(&unix_proto);
2091 }
2092
2093 module_init(af_unix_init);
2094 module_exit(af_unix_exit);
2095
2096 MODULE_LICENSE("GPL");
2097 MODULE_ALIAS_NETPROTO(PF_UNIX);